# Script to scrape additional information from individual plant pages on the RHS 'Find a Plant' website

In [350]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from datetime import date
import json
import pandas as pd
from collections import defaultdict
import re

In [351]:
# Path to the ChromeDriver executable handed to webdriver.Chrome.
DRIVER_PATH = './chromedriver_win32/chromedriver.exe'

# Chrome options reused for the driver session; headless is switched on.
options = Options()
options.headless = True

In [363]:
def main(infile='plants_short.txt', outfile='testaddcols.txt'):
    """Visit each plant's RHS detail page and save the enriched plant list.

    The input file is JSON with a top-level ``'data'`` list of dicts, each
    carrying a ``'detail_page'`` path relative to https://www.rhs.org.uk.
    Every dict is updated in place with the fields returned by
    ``get_plant_details`` and the result is written with
    ``DataFrame.to_json(orient='table', index=False)``.

    Parameters
    ----------
    infile : str
        Path of the JSON plant list to read.
    outfile : str
        Path the enriched table is written to.
    """
    # Load the input before starting the browser, so a missing or malformed
    # file cannot leave a Chrome process running.
    with open(infile, 'r') as fh:
        plantlist = json.load(fh)
    plants = plantlist['data']
    n = len(plants)
    updatedplantlist = []

    driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
    try:
        for i, d in enumerate(plants):
            driver.get('https://www.rhs.org.uk' + d['detail_page'])

            # Handle modal pop-up and beta sign-in
            if 'beta-optin' in driver.current_url:
                print('Opt-in')
                optin_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//button[@class="button button--ghost button--small button--w-100 button--w-auto-sm u-m-y-0"]/span[@class="button__text"][text()="Try the new version"]')))
                # Close the pop-up first when it is present and visible, then opt in.
                if check_exists_by_xpath(driver, '//span[@id="popupCloseTH"]'):
                    close_button = driver.find_element_by_xpath('//span[@id="popupCloseTH"]')
                    if close_button.is_displayed():
                        close_button.click()
                optin_button.click()

            print('Processing plant ' + str(i+1) + ' of ' + str(n) + '. URL of webpage is: ' + str(driver.current_url))
            # Fixed pause before reading the page source
            # (presumably to let the page finish rendering - TODO confirm).
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            d.update(get_plant_details(soup))
            updatedplantlist.append(d)
    finally:
        # Always shut the browser down, even when a page fails part-way through.
        driver.quit()

    dfplants = pd.DataFrame(updatedplantlist)
    dfplants.to_json(path_or_buf=outfile, orient='table', index=False)
    

In [364]:
def check_exists_by_xpath(driver, xpath):
    """Return True if the driver can locate an element for ``xpath``, else False.

    The lookup goes through ``driver.find_element_by_xpath``; a
    ``NoSuchElementException`` is treated as "not present". Any other
    exception propagates to the caller.
    """
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        found = False
    else:
        found = True
    return found

In [365]:
def find_size(soup, string):
    """Return the text that follows the label ``string`` on the page.

    The label is looked up with ``soup.find(string=string)``. When it is
    found, the text of the label's grandparent node is split on the label
    and the piece right after the first occurrence is returned with
    surrounding whitespace removed. When the label is absent, an empty
    string is returned.
    """
    label_node = soup.find(string=string)
    if label_node is None:
        return ''

    # The value sits in the same container as the label, two levels up.
    container_text = label_node.parent.parent.text
    return container_text.split(string)[1].strip()

In [366]:
def get_plant_details(soup):
    """Collect the extra detail fields for one plant from its parsed page.

    Parameters
    ----------
    soup : parsed page supporting ``find`` and ``find_all``
        (``main`` passes a BeautifulSoup object built from the page source).

    Returns
    -------
    dict
        Keys, in insertion order: ``details_query_date`` (today, formatted
        ``%d-%b-%Y``), the three size fields, three ``sunlight_*`` 0/1 flags,
        ``rhs_hardiness_rating``, ``foliage``, three ``moisture_*`` 0/1 flags
        and three ``acidity_*`` 0/1 flags. Missing text fields are ``''``.
    """
    details = {"details_query_date": date.today().strftime("%d-%b-%Y")}

    # Size facts: each value is the text that follows its label on the page.
    size_fields = (
        ("ultimate_height", "Ultimate height"),
        ("time_to_ultimate_height", "Time to ultimate height"),
        ("ultimate_spread", "Ultimate spread"),
    )
    for field, label in size_fields:
        details[field] = find_size(soup=soup, string=label)

    # Sunlight: flag is 1 when the exact label text occurs anywhere on the page.
    sunlight_fields = (
        ("sunlight_full_sun", "Full sun"),
        ("sunlight_full_shade", "Full shade"),
        ("sunlight_partial_shade", "Partial shade"),
    )
    for field, label in sunlight_fields:
        details[field] = 0 if soup.find(string=label) is None else 1

    # Hardiness rating: a <span> whose whole text is 'H' plus one digit.
    rating_span = soup.find("span", string=re.compile("^H[0-9]$"))
    details["rhs_hardiness_rating"] = '' if rating_span is None else rating_span.string

    # Foliage: the value is read from the second child of the <dt>'s parent.
    foliage_dt = soup.find("dt", string="Foliage")
    if foliage_dt is None:
        details["foliage"] = ''
    else:
        details["foliage"] = foliage_dt.parent.contents[1].contents[0].text

    # Moisture and acidity: spans are matched on their letters only, so
    # punctuation and whitespace in the span text are ignored.
    candidate_spans = soup.find_all("span", {"class": "ng-star-inserted"})
    flag_for_letters = {
        "Moistbutwelldrained": "moisture_moist_but_well_drained",
        "Welldrained": "moisture_well_drained",
        "Poorlydrained": "moisture_poorly_drained",
        "Acid": "acidity_acid",
        "Neutral": "acidity_neutral",
        "Alkaline": "acidity_alkaline",
    }
    for flag in flag_for_letters.values():
        details[flag] = 0

    for span in candidate_spans:
        span_text = span.string
        if span_text is None:
            continue
        flag = flag_for_letters.get(re.sub("[^a-zA-Z]+", "", span_text))
        if flag is not None:
            details[flag] = 1

    return details

In [367]:
# Run the scraper: prints one progress line per plant and writes the enriched list to disk.
main()

Opt-in
Processing plant 1 of 4. URL of webpage is: https://www.rhs.org.uk/plants/96782/camellia-sasanqua-jean-may-/details-beta
Processing plant 2 of 4. URL of webpage is: https://www.rhs.org.uk/plants/87195/camellia-times-williamsii-les-jury-/details-beta
Processing plant 3 of 4. URL of webpage is: https://www.rhs.org.uk/plants/97484/camellia-times-williamsii-clarrie-fawcett-/details-beta
Processing plant 4 of 4. URL of webpage is: https://www.rhs.org.uk/plants/89149/camellia-sasanqua-hugh-evans-/details-beta


In [373]:

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from datetime import date
import json
import pandas as pd
from collections import defaultdict
import re
import argparse


# Read the enriched plant list from disk.
# `infile` stays bound to the path; the file handle gets its own name so that
# no later cell finds a closed file object under `infile`.
infile = 'plants_enriched_2021Mar18-134109.txt'
with open(infile, 'r') as fh:
    indata = fh.read()

# Parse the JSON text; the plant records are in the top-level 'data' list.
plantlist = json.loads(indata)
n = len(plantlist['data'])

In [374]:
# Inspect the parsed plant records (a list of dicts, one per plant).
# NOTE(review): this echoes every record; slicing (e.g. [:3]) would keep the output short.
plantlist['data']

[{'img_src': 'https://apps.rhs.org.uk/plantselectorimages/detail/_KOS1977.jpg',
  'botanical_name': "Camellia sasanqua 'Jean May'",
  'common_name': "camellia 'Jean May'",
  'brief_desc': "'Jean May' is a compact, bushy evergreen shrub with dark foliage and slightly fragrant, semi-double or double, shell-pink flowers to 10cm across in winter and early spring",
  'detail_page': '/plants/96782/camellia-sasanqua-jean-may-/details-beta',
  'rhs_id': '96782',
  'query_date': '18-Mar-2021',
  'agm_plant': '1',
  'num_suppliers': '4',
  'supplier_search': '/plants/nurseries-search-result?query=96782',
  'rhsplants_url': '',
  'rhsplants_price_gbp': '',
  'details_query_date': '18-Mar-2021',
  'ultimate_height': '1.5–2.5 metres',
  'time_to_ultimate_height': '10–20 years',
  'ultimate_spread': '1.5–2.5 metres',
  'sunlight_full_sun': 1,
  'sunlight_full_shade': 1,
  'sunlight_partial_shade': 1,
  'rhs_hardiness_rating': 'H4',
  'foliage': 'Evergreen',
  'moisture_moist_but_well_drained': 1,
  

In [376]:
# View the records as a table: one row per plant, one column per field.
pd.DataFrame(plantlist['data'])

Unnamed: 0,img_src,botanical_name,common_name,brief_desc,detail_page,rhs_id,query_date,agm_plant,num_suppliers,supplier_search,...,sunlight_full_shade,sunlight_partial_shade,rhs_hardiness_rating,foliage,moisture_moist_but_well_drained,moisture_well_drained,moisture_poorly_drained,acidity_acid,acidity_neutral,acidity_alkaline
0,https://apps.rhs.org.uk/plantselectorimages/de...,Camellia sasanqua 'Jean May',camellia 'Jean May',"'Jean May' is a compact, bushy evergreen shrub...",/plants/96782/camellia-sasanqua-jean-may-/deta...,96782,18-Mar-2021,1,4,/plants/nurseries-search-result?query=96782,...,1,1,H4,Evergreen,1,1,0,1,1,0
1,https://apps.rhs.org.uk/plantselectorimages/de...,Camellia × williamsii 'Les Jury',camellia 'Les Jury','Les Jury' is a bushy evergreen shrub with dar...,/plants/87195/camellia-times-williamsii-les-ju...,87195,18-Mar-2021,1,16,/plants/nurseries-search-result?query=87195,...,1,1,H5,Evergreen,1,1,0,1,1,0
2,https://apps.rhs.org.uk/plantselectorimages/de...,Camellia × williamsii 'Clarrie Fawcett',camellia 'Clarrie Fawcett',"'Clarrie Fawcett' is an evergreen, upright shr...",/plants/97484/camellia-times-williamsii-clarri...,97484,18-Mar-2021,1,0,,...,1,1,H5,Evergreen,1,1,0,1,1,0
3,https://apps.rhs.org.uk/plantselectorimages/de...,Camellia sasanqua 'Hugh Evans',camellia 'Hugh Evans','Hugh Evans' is a vigorous evergreen shrub wit...,/plants/89149/camellia-sasanqua-hugh-evans-/de...,89149,18-Mar-2021,1,12,/plants/nurseries-search-result?query=89149,...,1,1,H4,Evergreen,1,1,0,1,1,0
4,https://apps.rhs.org.uk/plantselectorimages/de...,Camellia japonica 'Mercury',camellia 'Mercury','Mercury' is a compact medium-sized shrub with...,/plants/93448/camellia-japonica-mercury-/detai...,93448,18-Mar-2021,1,1,/plants/nurseries-search-result?query=93448,...,1,1,H5,Evergreen,1,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,,Camellia japonica 'Fire Falls',camellia 'Fire Falls',,/plants/52912/camellia-japonica-fire-falls-/de...,52912,18-Mar-2021,1,0,,...,0,0,,,0,0,0,0,0,0
95,,Camellia japonica 'Deep Secret',camellia 'Deep Secret',,/plants/85378/camellia-japonica-deep-secret-/d...,85378,18-Mar-2021,1,0,,...,0,0,,,0,0,0,0,0,0
96,,Camellia japonica 'Tom Thumb',camellia 'Tom Thumb',,/plants/62619/camellia-japonica-tom-thumb-/det...,62619,18-Mar-2021,1,4,/plants/nurseries-search-result?query=62619,...,0,0,,,0,0,0,0,0,0
97,,Camellia × williamsii 'Mary Phoebe Taylor',camellia 'Mary Phoebe Taylor',,/plants/47666/camellia-times-williamsii-mary-p...,47666,18-Mar-2021,1,4,/plants/nurseries-search-result?query=47666,...,0,0,,,0,0,0,0,0,0
