In [1]:
import random
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains

In [2]:
options = webdriver.ChromeOptions()
options.add_argument("--incognito")
options.add_argument("--headless")
timeout=10

num_of_spp=782 # according to the web page

In [3]:
browser = webdriver.Chrome(executable_path='./chromedriver.exe',chrome_options=options)

  """Entry point for launching an IPython kernel.


In [4]:
browser.get("https://www.webambiente.gov.br/publico/especies.xhtml")

### Helper functions

In [5]:
slugify = lambda string: string.lower().strip().replace(' ','_').replace('.','')


# saves a species page html
def saveSpeciesPageHtml(browser,path='./species_html/'):
    selector = "div.conteudo#divmain"
    WebDriverWait(browser,timeout).until(EC.visibility_of_all_elements_located( (By.CSS_SELECTOR, selector) ))
    content = browser.find_element_by_css_selector(selector)
    spName = content.find_element_by_css_selector("div.conteudo_util h1").text
    with open( f"{path}{slugify(spName)}.html", 'w', encoding="utf-8") as f:
        f.write(content.get_attribute("innerHTML"))
        
    return (spName, content)


def getRows(browser):
    selector = "form#selectEspecieFormTabela"
    WebDriverWait(browser,timeout).until(EC.visibility_of_element_located( (By.CSS_SELECTOR, selector)))
    speciesTable = browser.find_element_by_css_selector(selector)
    rows = speciesTable.find_elements_by_tag_name('tr')[1:] # exclude header
    return rows

def sequenceToPage(n):
    """
    Returns the sequence to navigate to a page
    """
    path=[]

    # is visible by default
    if n<=10:
        path = [n]

    # navigate forward
    elif n<=39:
        path = list(range( 10,n,4 ))
        if path[-1]!=n:
            path.append(n)

    # navigate backwards
    else:
        path.append('Last Page')
        if n>=70 and n!=79:
            path.append(n)
        else:
            path += list(range(70,n,-5))
            if path[-1]!=n:
                path.append(n)  
    return path



# These functions navigate to other pages
def goback(browser):
    xpath = "//button/span[text()='Voltar']"
    WebDriverWait(browser,timeout).until(EC.visibility_of_element_located( (By.XPATH, xpath)) )
    browser.find_element_by_xpath(xpath).click()
    
def gotopage(browser, pageNum):
    for p in sequenceToPage(pageNum):
        if p=='Last Page':
            xpath=f"//a[@aria-label='Last Page']"
        else:
            xpath = f"//a[@aria-label='Page {p}']"
        WebDriverWait(browser,timeout).until(EC.visibility_of_element_located( (By.XPATH, xpath)) )
        browser.find_element_by_xpath(xpath).click()
    

def gotospeciespage(browser, rowNum):
    getRows(browser)[rowNum].click()
                                    

Rows and columns

In [6]:
pagination_size = 10
num_of_spp = 782

pages_rows = [ (i+1, [ j for j in range(0,pagination_size) if i*pagination_size+(j+1) <=num_of_spp ] ) 
                    for i in range(num_of_spp//pagination_size+1) ]

# Enter page 1 of the species list

In [7]:
WebDriverWait(browser,timeout).until(EC.visibility_of_element_located( (By.ID,"j_idt78:selectEspecieForm:j_idt90") ))
searchAll_button = browser.find_element_by_id("j_idt78:selectEspecieForm:j_idt90")

In [8]:
searchAll_button.click()

# Get the first 10 species (page 1)

### Collect data from the first 10 species

In [416]:
from time import sleep
import random

In [516]:
page,rows = pages_rows[1] # first page (first 10 rows)

for r in rows:
    gotopage(browser,page)
    sleep(2)
    gotospeciespage(browser,r)
    
    sleep(2)
    
    spname,content = saveSpeciesPageHtml(browser)
    print(f"Collected for {spname}")
    goback(browser)
    
    sleep(random.random()*3 + 3) # delay between 3 and 6 secs

Collected for Actinocladum verticillatum (Nees) McClure ex Soderstr.
Collected for Adesmia bicolor (Poir.) DC.


KeyboardInterrupt: 

# Get the entire dataset

In [9]:
for page,rows in pages_rows:
    for r in rows:
        try:
            gotopage(browser,page)
            sleep(1)
            gotospeciespage(browser,r)
            sleep(1)
            
            spname,content = saveSpeciesPageHtml(browser)
            print(f"Page {page}, row {r}: Collected data for {spname}")

        except Exception as e:
            print(f"ERROR: A problem occurred in Page {page}, row {r}")
            print(str(e))
        
            
        finally:
            goback(browser)
            sleep(random.random()*3 + 3) # delay between 3 and 6 secs
            


Page 1, row 0: Collected data for Abarema brachystachya (DC.) Barneby & J.W.Grimes
Page 1, row 1: Collected data for Abarema cochleata (Willd.) Barneby & J.W.Grimes
Page 1, row 2: Collected data for Abarema floribunda (Spruce ex Benth.) Barneby & J.W.Grimes
Page 1, row 3: Collected data for Abarema jupunba (Willd.) Britton & Killip.
Page 1, row 4: Collected data for Abarema piresii Barneby & J.W.Grimes
Page 1, row 5: Collected data for Acanthosyris spinescens (Mart. & Eichler) Griseb.
Page 1, row 6: Collected data for Acca sellowiana (O.Berg) Burret
Page 1, row 7: Collected data for Achyrocline satureioides (Lam.) DC.
Page 1, row 8: Collected data for Acioa edulis Prance
Page 1, row 9: Collected data for Acrocomia aculeata (Jacq) Lodd. ex Mart.
Page 2, row 0: Collected data for Actinocladum verticillatum (Nees) McClure ex Soderstr.
Page 2, row 1: Collected data for Adesmia bicolor (Poir.) DC.
Page 2, row 2: Collected data for Adesmia latifolia (Spreng.) Vogel
Page 2, row 3: Collected d

Page 13, row 2: Collected data for Blepharocalyx salicifolius (Kunth) O.Berg
Page 13, row 3: Collected data for Bowdichia nitida Spruce ex Benth
Page 13, row 4: Collected data for Bowdichia virgilioides Kunth
Page 13, row 5: Collected data for Bromelia balansae Mez
Page 13, row 6: Collected data for Bromus auleticus Trin. ex Nees
Page 13, row 7: Collected data for Bromus catharticus Vahl
Page 13, row 8: Collected data for Brosimum gaudichaudii Trécul
Page 13, row 9: Collected data for Brosimum guianense (Aubl.) Huber
Page 14, row 0: Collected data for Brosimum parinarioides Ducke
Page 14, row 1: Collected data for Brosimum rubescens Taub.
Page 14, row 2: Collected data for Buchenavia grandis Ducke
Page 14, row 3: Collected data for Buchenavia tomentosa Eichler
Page 14, row 4: Collected data for Butia leiospatha (Barb.Rodr.) Becc.
Page 14, row 5: Collected data for Butia odorata (Barb. Rodr.) Noblick
Page 14, row 6: Collected data for Butia yatay (Mart.) Becc.
Page 14, row 7: Collected 

Page 25, row 6: Collected data for Cupania vernalis Cambess.
Page 25, row 7: Collected data for Curatella americana L.
Page 25, row 8: Collected data for Cybistax antisyphillitica (Mart.) Mart.
Page 25, row 9: Collected data for Dalbergia cearensis Ducke
Page 26, row 0: Collected data for Dalbergia miscolobium Benth.
Page 26, row 1: Collected data for Dalbergia nigra (Vell.) Allemão ex Benth
Page 26, row 2: Collected data for Dalbergia spruceana Benth.
Page 26, row 3: Collected data for Danthonia montevidensis Hack. & Arechav.
Page 26, row 4: Collected data for Dasyphyllum spinescens (Less.) Cabrera
Page 26, row 5: Collected data for Davilla elliptica A.St.-Hil.
Page 26, row 6: Collected data for Desmodium adscendens (Sw.) DC.
Page 26, row 7: Collected data for Desmodium barbatum (L.) Benth.
Page 26, row 8: Collected data for Desmodium incanum (Sw.) DC.
Page 26, row 9: Collected data for Dialium guianense (Aubl.) Sandwith
Page 27, row 0: Collected data for Dichanthelium sabulorum (Lam.

Page 37, row 8: Collected data for Hymenaea courbaril L.
Page 37, row 9: Collected data for Hymenaea martiana Hayne
Page 38, row 0: Collected data for Hymenaea parvifolia Huber
Page 38, row 1: Collected data for Hymenaea stigonocarpa Mart. ex Hayne
Page 38, row 2: Collected data for Hymenolobium excelsum Ducke
Page 38, row 3: Collected data for Hymenolobium petraeum Ducke
Page 38, row 4: Collected data for Ilex brevicuspis Reissek
Page 38, row 5: Collected data for Ilex dumosa Reissek
Page 38, row 6: Collected data for Ilex paraguariensis A. St.-Hil.
Page 38, row 7: Collected data for Espécies Nativas
Page 38, row 8: Collected data for Inga alba (Sw.) Willd.
Page 38, row 9: Collected data for Inga capitata Desv.
Page 39, row 0: Collected data for Inga cinnamomea Spruce ex Benth.
Page 39, row 1: Collected data for Inga cylindrica (Vell.) Mart.
Page 39, row 2: Collected data for Inga edulis Mart.
Page 39, row 3: Collected data for Inga laurina (Sw.) Willd.
Page 39, row 4: Collected data 

Page 50, row 1: Collected data for Myrcia splendens DC.
Page 50, row 2: Collected data for Myrcia tomentosa (Aubl.) DC.
Page 50, row 3: Collected data for Myrocarpus frondosus Allemão
Page 50, row 4: Collected data for Myroxylon peruiferum L.f.
Page 50, row 5: Collected data for Myrrhinium atropurpureum Schott
Page 50, row 6: Collected data for Myrsine coriacea (Sw.) R.Br. ex Roem. & Schult.
Page 50, row 7: Collected data for Myrsine guianensis (Aubl.) Kuntze
Page 50, row 8: Collected data for Myrsine laetevirens (Mez) Arechav.
Page 50, row 9: Collected data for Myrsine parvula (Mez) Otegui
Page 51, row 0: Collected data for Myrsine umbellata (Mart.) Mez
Page 51, row 1: Collected data for Nectandra cuspidata Nees
Page 51, row 2: Collected data for Nectandra lanceolata Nees
Page 51, row 3: Collected data for Nectandra leucantha Nees
Page 51, row 4: Collected data for Nectandra megapotamica (Sprengel) Mez
Page 51, row 5: Collected data for Nectandra membranacea (Sw.) Griseb.
Page 51, row

Page 62, row 3: Collected data for Qualea grandiflora Mart.
Page 62, row 4: Collected data for Qualea paraensis Ducke
Page 62, row 5: Collected data for Qualea parviflora Mart.
Page 62, row 6: Collected data for Quararibea cordata (Bonpl.) Vischer
Page 62, row 7: Collected data for Quillaja brasiliensis (A.St.-Hil. & Tul.) Mart.
Page 62, row 8: Collected data for Rhamnidium elaeocarpum Reissek
Page 62, row 9: Collected data for Rhizophora mangle L.
Page 63, row 0: Collected data for Rhynchosia corylifolia Mart. ex Benth.
Page 63, row 1: Collected data for Rhynchosia diversifolia Micheli
Page 63, row 2: Collected data for Rhynchospora barrosiana Guagl.
Page 63, row 3: Collected data for Rhynchospora rugosa (Vahl) Gale
Page 63, row 4: Collected data for Rhynchospora tenuis Link
Page 63, row 5: Collected data for Riencourtia oblongifolia Gardner
Page 63, row 6: Collected data for Rinorea guianensis Aubl.
Page 63, row 7: Collected data for Roupala montana Aubl.
Page 63, row 8: Collected da

Page 74, row 3: Collected data for Uncaria guianensis (Aubl.) J.F.Gmel.
Page 74, row 4: Collected data for Urochloa decumbens (Stapf) R.D.Webster
Page 74, row 5: Collected data for Urochloa humidicola (Rendle) Morrone & Zuloaga
Page 74, row 6: Collected data for Vachellia caven (Molina) Seigler & Ebinger
Page 74, row 7: Collected data for Vachellia farnesiana (L.) Wight & Arn.
Page 74, row 8: Collected data for Vatairea guianensis Aubl.
Page 74, row 9: Collected data for Vatairea macrocarpa (Benth.) Ducke
Page 75, row 0: Collected data for Vatairea paraensis Ducke
Page 75, row 1: Collected data for Vataireopsis speciosa Ducke
Page 75, row 2: Collected data for Vellozia dasypus Seub.
Page 75, row 3: Collected data for Vellozia epidendroides Mart. ex Schult. & Schult.f.
Page 75, row 4: Collected data for Vellozia squamata Pohl
Page 75, row 5: Collected data for Vigna luteola (Jacq.) Benth.
Page 75, row 6: Collected data for Virola michelii Heckel
Page 75, row 7: Collected data for Virola