In [1]:
# Import dependencies
from time import sleep
import pandas as pd
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs

In [2]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183


 


[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/85.0.4183.87/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\pooja\.wdm\drivers\chromedriver\win32\85.0.4183.87]


## Extract: scrape animal data from zoo.org.au

In [3]:
zoo_url = "https://www.zoo.org.au/fighting-extinction/local-threatened-species/"

# Use splinter to navigate the site
browser.visit(zoo_url)

In [4]:
# Create BeautifulSoup object; parse with 'lxml'
html = browser.html
soup = bs(html, 'lxml')

In [5]:
# Retrieve the htmls of the links to each animal
animal_html_list = soup.find_all("div", class_="feature-tile__title")

In [6]:
# Retrieve the animal names in all of the above htmls
animal_list = []

for animal in animal_html_list:
    animal_list.append(animal.get_text().split("(")[0].strip())

In [7]:
animal_list

['Alpine She-oak Skink',
 'Baw Baw Frog',
 'Brush-tailed Rock-wallaby',
 'Eastern Barred Bandicoot',
 'Giant Burrowing Frog',
 'Golden-rayed Blue Butterfly',
 'Grassland Earless Dragon',
 'Guthega Skink',
 'Helmeted Honeyeater',
 'Key’s Matchstick Grasshopper',
 'Large Brown Tree Frog',
 "Leadbeater's Possum",
 'Lord Howe Island Stick Insect',
 'Mallee Emu-wren',
 'Mountain Pygmy-possum',
 'New Holland Mouse',
 'Northern Corroboree Frog',
 'Orange-bellied Parrot',
 'Plains-wanderer',
 'Regent Honeyeater',
 'Smoky Mouse',
 'Southern Bent-wing Bat',
 'Southern Corroboree Frog',
 'Spotted Tree Frog',
 'Stuttering Barred Frog',
 'Swift Parrot',
 'Tasmanian Devil']

In [8]:
# Click the first link under "Learn More" to go to the first animal page
browser.links.find_by_partial_text("Learn More").click()

In [9]:
# Loop through all of the animals by interacting with the next element in navigation bar
animal_info_image_zoo = list()

for item in animal_list:

    sleep(1)
    html = browser.html
    soup = bs(html, 'lxml')

    # Retrieve the first animal image url
    image_url = soup.select_one("picture img")["src"]
    if image_url.startswith("/media"):
        full_image_url = f"https://www.zoo.org.au{image_url}"
        print(full_image_url)
    else:
        full_image_url = image_url
        print(full_image_url)

    # Retrieve the first animal image alternative
    image_alternative = soup.select_one("picture img")["alt"]
        
    # Retrieve the introduction paragraph
    intro = soup.find("p", class_="intro").text.strip()

    # Retrieve some info on threats
    try:
        threat_paragraph = soup.find("div", class_="row-wrapper--right-weighted-column").text.strip()
    except AttributeError:
        results = soup.find_all("p")
        threat_paragraph = [result.text.strip() for result in results][1:4]
        
    # Append a dictionary with the above info to a list
    animal_info_image_zoo.append({
        "animal_name": item,
        "image_url": full_image_url,
        "image_alternative": image_alternative,
        "introduction": intro,
        "threat_paragraph": threat_paragraph
    })

    # Click the next animal page in navigation bar down to the bottom of the page
    browser.find_by_xpath('//*[@id="main-content"]/nav/div/div/div/div[2]/nav/a[2]').click()

https://www.zoo.org.au/media/2050/1023_alpine_she-oak_skink_healesville_sanctuary1.jpg?anchor=center&mode=crop&quality=75&width=2000&height=570&rnd=132131638000000000
https://www.zoo.org.au/media/2052/21295_baw_baw_frog_melbourne_zoo1.jpg?anchor=center&mode=crop&quality=75&width=2000&height=570&rnd=132131638840000000
https://www.zoo.org.au/media/2045/21882_brush-tailed_rock-wallabies_healesville_sanctuary.jpg?center=0.46198830409356723,0.31833333333333336&mode=crop&quality=75&width=2000&height=570&rnd=132131636770000000
https://www.zoo.org.au/media/2053/4376_eastern_barred_bandicoot_in_bandicoot_hideout_werribee_open_range_zoo.jpg?center=0.48333333333333334,0.37333333333333335&mode=crop&quality=75&width=2000&height=570&rnd=132131642280000000
https://www.zoo.org.au/media/2056/23479_giant_burrowing_frog_-_credit_required_offsite1.jpg?anchor=center&mode=crop&quality=75&width=2000&height=570&rnd=132131643480000000
https://www.zoo.org.au/media/1703/animal-detail-page_0001_golden-rayed-blue-

## Extract: scrape animal data from environment.vic.gov.au

In [10]:
# URL of page to be scraped
url = "https://www.environment.vic.gov.au/conserving-threatened-species/threatened-species"

# Use splinter to navigate the site
browser.visit(url)

In [11]:
# Create BeautifulSoup object; parse with 'lxml'
html = browser.html
soup = bs(html, 'lxml')

In [12]:
# Retrieve all text with links
text_with_links = soup.find_all(class_ = 'internal-link')

for text in text_with_links:
    print(text.get_text())

Caring for Our Environment - Faunal Emblems Program
Baw Baw Frog
Brush-tailed Rock-wallaby
Eastern Barred Bandicoot
Greater Glider
Helmeted Honeyeater
Hooded Plover
Leadbeater's Possum
Macquarie Perch (PDF, 397.0 KB)
Mountain Pygmy-possum
Orange-bellied Parrot
Plains-wanderer
Regent Honeyeater
Mountain Swainson-pea Swainsona recta (PDF, 330.7 KB)
Concave Pomaderris (Pomaderris subplicata) (PDF, 1.5 MB)
Phantom Wattle  (Acacia phasmoides) (PDF, 331.8 KB)
Stony Bush-pea (Pultanea lapidosa) (PDF, 2.7 MB)


In [13]:
# As we are only interested in fauna, we'll only keep animal names.
text_to_search = [text.get_text() for text in text_with_links][1: -4]
text_to_search

['Baw Baw Frog',
 'Brush-tailed Rock-wallaby',
 'Eastern Barred Bandicoot',
 'Greater Glider',
 'Helmeted Honeyeater',
 'Hooded Plover',
 "Leadbeater's Possum",
 'Macquarie Perch (PDF, 397.0 KB)',
 'Mountain Pygmy-possum',
 'Orange-bellied Parrot',
 'Plains-wanderer',
 'Regent Honeyeater']

In [14]:
animal_info_image_environment = []

for i in range(len(text_to_search)):
    
    # Click each of the links to the threatened species.
    browser.find_by_xpath(f'//*[@id="content_container_70299"]/p[{i+1}]/a').click()
    sleep(1)
    html = browser.html
    soup = bs(html, 'lxml')

    # Escape from the page with pdf file (not html)
    if text_to_search[i] == "Macquarie Perch (PDF, 397.0 KB)":
        browser.visit(url)
        sleep(1)
        
    else:
        # Retrieve animal image url if available
        try:
            full_image_url = soup.select_one("div.content-wrapper__col-content img")["src"]
            print(full_image_url)
        except TypeError:
            full_image_url = None

        # Retrieve the species info section
        try:
            intro = soup.find("div", class_="col-xs-12 col-sm-6 feature-box").text.strip()
        except AttributeError:
            results = soup.find_all("div", class_="row top-xs")
            intro = [result.text.strip() for result in results][0]

        # Retrieve the threat info section
        try:
            threat_paragraph = soup.find("div", class_="col-xs-12 feature-box").text.strip()
        except AttributeError:
            results = soup.find_all("div", class_="row top-xs")
            threat_paragraph = [result.text.strip() for result in results][1]

        # Append the dictionary with the above info to a list
        animal_info_image_environment.append({
            "animal_name": text_to_search[i],
            "image_url": full_image_url,
            "image_alternative": text_to_search[i],
            "introduction": intro,
            "threat_paragraph": threat_paragraph
        })
        
        # Back to the starting url
        browser.visit(url) 
        sleep(1)

https://www.environment.vic.gov.au/__data/assets/image/0027/429534/Baw-Baw-Frog.jpg
https://www.environment.vic.gov.au/__data/assets/image/0030/428556/Eastern-Barred-Bandicoot.jpg
https://www.environment.vic.gov.au/__data/assets/image/0024/440367/Greater-Glider-black-Steve-Smith.jpg
https://www.environment.vic.gov.au/__data/assets/image/0031/427846/Helmeted-Honeyeater.jpg
https://www.environment.vic.gov.au/__data/assets/image/0030/428808/IMG_3825lr.jpg
https://www.environment.vic.gov.au/__data/assets/image/0033/427983/Lowland-LBP_Zoos-Victoria.JPG
https://www.environment.vic.gov.au/__data/assets/image/0026/428930/28310791230_680f9be730_k.jpg
https://www.environment.vic.gov.au/__data/assets/image/0024/428640/OBP.jpg
https://www.environment.vic.gov.au/__data/assets/image/0029/428618/Plains-wanderer.jpg
https://www.environment.vic.gov.au/__data/assets/image/0025/429514/2015-Yellow-Yellow-Red-Metal-YYRM_Dean-Ingwersen.jpg


In [15]:
# Quit the browser after scraping
browser.quit()

## Consolidate data scraped from zoo.org.au and environment.vic.gov.au

In [16]:
# Overview of data scraped from environment.vic.gov.au
animal_info_image_environment_df = pd.DataFrame(animal_info_image_environment)
animal_info_image_environment_df.head()

Unnamed: 0,animal_name,image_url,image_alternative,introduction,threat_paragraph
0,Baw Baw Frog,https://www.environment.vic.gov.au/__data/asse...,Baw Baw Frog,Species information\n\nEndemic to the Mt Baw B...,Threats\n\nThe wild population of Baw Baw Frog...
1,Brush-tailed Rock-wallaby,,Brush-tailed Rock-wallaby,Species Information\n\nThe Southern Evolutiona...,Species Information\n\nThe Southern Evolutiona...
2,Eastern Barred Bandicoot,https://www.environment.vic.gov.au/__data/asse...,Eastern Barred Bandicoot,Species information\n\nThe Eastern Barred Band...,Threats\n\nLoss of their grassland habitat and...
3,Greater Glider,https://www.environment.vic.gov.au/__data/asse...,Greater Glider,,Species InformationThe Greater Glider is Austr...
4,Helmeted Honeyeater,https://www.environment.vic.gov.au/__data/asse...,Helmeted Honeyeater,Species information\n\nThis subspecies of the ...,Threats\n\nAs numbers of Helmeted Honeyeaters ...


In [17]:
# Overview of data scraped from zoo.org.au
animal_info_image_zoo_df = pd.DataFrame(animal_info_image_zoo)
animal_info_image_zoo_df.head()

Unnamed: 0,animal_name,image_url,image_alternative,introduction,threat_paragraph
0,Alpine She-oak Skink,https://www.zoo.org.au/media/2050/1023_alpine_...,Alpine She-oak Skink sunning it self on a rock...,Found in only a few locations in Victoria and ...,Major threats\nFire is a huge danger to the Al...
1,Baw Baw Frog,https://www.zoo.org.au/media/2052/21295_baw_ba...,Baw Baw Frog resting in bright green moss. Loo...,All estimates point to extinction in the wild ...,Major threats\nThe loss of the Baw Baw Frog is...
2,Brush-tailed Rock-wallaby,https://www.zoo.org.au/media/2045/21882_brush-...,Brush Tailed Rock Wallabies resting in the grass.,"In Victoria, the Brush-tailed Rock-wallaby now...",Major threats \nChanges to habitat and the imp...
3,Eastern Barred Bandicoot,https://www.zoo.org.au/media/2053/4376_eastern...,Small Eastern Barred Bandicoot side view forag...,The Eastern Barred Bandicoot is listed as exti...,The plan for recovery\nZoos Victoria has partn...
4,Giant Burrowing Frog,https://www.zoo.org.au/media/2056/23479_giant_...,Giant Burrowing Frog on wet rocks side view. T...,Although we know that populations of the Giant...,"[Over the next five years, Zoos Victoria will ..."


In [18]:
# Outer merge to see which animals in the zoo site but not in the environment site and vice versa
animal_info_image_df = pd.merge(animal_info_image_zoo_df, animal_info_image_environment_df,
                                on="animal_name", how="outer", suffixes=("_zoo", "_envi"))
animal_info_image_df

Unnamed: 0,animal_name,image_url_zoo,image_alternative_zoo,introduction_zoo,threat_paragraph_zoo,image_url_envi,image_alternative_envi,introduction_envi,threat_paragraph_envi
0,Alpine She-oak Skink,https://www.zoo.org.au/media/2050/1023_alpine_...,Alpine She-oak Skink sunning it self on a rock...,Found in only a few locations in Victoria and ...,Major threats\nFire is a huge danger to the Al...,,,,
1,Baw Baw Frog,https://www.zoo.org.au/media/2052/21295_baw_ba...,Baw Baw Frog resting in bright green moss. Loo...,All estimates point to extinction in the wild ...,Major threats\nThe loss of the Baw Baw Frog is...,https://www.environment.vic.gov.au/__data/asse...,Baw Baw Frog,Species information\n\nEndemic to the Mt Baw B...,Threats\n\nThe wild population of Baw Baw Frog...
2,Brush-tailed Rock-wallaby,https://www.zoo.org.au/media/2045/21882_brush-...,Brush Tailed Rock Wallabies resting in the grass.,"In Victoria, the Brush-tailed Rock-wallaby now...",Major threats \nChanges to habitat and the imp...,,Brush-tailed Rock-wallaby,Species Information\n\nThe Southern Evolutiona...,Species Information\n\nThe Southern Evolutiona...
3,Eastern Barred Bandicoot,https://www.zoo.org.au/media/2053/4376_eastern...,Small Eastern Barred Bandicoot side view forag...,The Eastern Barred Bandicoot is listed as exti...,The plan for recovery\nZoos Victoria has partn...,https://www.environment.vic.gov.au/__data/asse...,Eastern Barred Bandicoot,Species information\n\nThe Eastern Barred Band...,Threats\n\nLoss of their grassland habitat and...
4,Giant Burrowing Frog,https://www.zoo.org.au/media/2056/23479_giant_...,Giant Burrowing Frog on wet rocks side view. T...,Although we know that populations of the Giant...,"[Over the next five years, Zoos Victoria will ...",,,,
5,Golden-rayed Blue Butterfly,https://www.zoo.org.au/media/1703/animal-detai...,Golden Rayed Blue Butterfly resting with wings...,This Victorian butterfly is critically endange...,[It lives in the narrow bands of remnant flood...,,,,
6,Grassland Earless Dragon,https://www.zoo.org.au/media/1716/23200_grassl...,Grassland Earless Dragon lizard side view look...,The Grassland Earless Dragon has not appeared ...,The plan for recovery\nZoos Victoria has been ...,,,,
7,Guthega Skink,https://www.zoo.org.au/media/1980/10411_gutheg...,Baby Guthega Skink side view. Lizard looking t...,The Guthega Skink is endangered in Victoria an...,Major threats\nHabitat disturbance caused by g...,,,,
8,Helmeted Honeyeater,https://www.zoo.org.au/media/1681/18660_juveni...,Juvenile Helmeted Honeyeater standing on a tre...,"The Helmeted Honeyeater, the bird emblem for V...",Major threats\nLoss of habitat is the primary ...,https://www.environment.vic.gov.au/__data/asse...,Helmeted Honeyeater,Species information\n\nThis subspecies of the ...,Threats\n\nAs numbers of Helmeted Honeyeaters ...
9,Key’s Matchstick Grasshopper,https://www.zoo.org.au/media/1702/animal-detai...,Key's Matchstick Grasshopper. Its brown with a...,This distinctive grasshopper is flightless and...,[The Key’s Matchstick Grasshopper hasn’t been ...,,,,


In [19]:
# Retrieve the animal list in the environment site but not in the zoo site
animals_in_envi_not_in_zoo = animal_info_image_df[animal_info_image_df[
    "image_url_zoo"].isnull()]["animal_name"].to_list()
animals_in_envi_not_in_zoo

['Greater Glider', 'Hooded Plover']

In [20]:
# Extract the data of the animals in the environment site but not in the zoo site
animals_in_envi_add_to_zoo = animal_info_image_environment_df[
    animal_info_image_environment_df["animal_name"].isin(animals_in_envi_not_in_zoo)]
animals_in_envi_add_to_zoo

Unnamed: 0,animal_name,image_url,image_alternative,introduction,threat_paragraph
3,Greater Glider,https://www.environment.vic.gov.au/__data/asse...,Greater Glider,,Species InformationThe Greater Glider is Austr...
5,Hooded Plover,https://www.environment.vic.gov.au/__data/asse...,Hooded Plover,Species Information\n\nHooded Plovers are a re...,Threats\n\nHooded Plovers often share their oc...


In [21]:
# Consolidate data from both sites (keeping all of the ones in the zoo site and adding the extra in the environment site)
final_animal_image_df = pd.concat([animal_info_image_zoo_df, animals_in_envi_add_to_zoo], axis=0)
final_animal_image_df

Unnamed: 0,animal_name,image_url,image_alternative,introduction,threat_paragraph
0,Alpine She-oak Skink,https://www.zoo.org.au/media/2050/1023_alpine_...,Alpine She-oak Skink sunning it self on a rock...,Found in only a few locations in Victoria and ...,Major threats\nFire is a huge danger to the Al...
1,Baw Baw Frog,https://www.zoo.org.au/media/2052/21295_baw_ba...,Baw Baw Frog resting in bright green moss. Loo...,All estimates point to extinction in the wild ...,Major threats\nThe loss of the Baw Baw Frog is...
2,Brush-tailed Rock-wallaby,https://www.zoo.org.au/media/2045/21882_brush-...,Brush Tailed Rock Wallabies resting in the grass.,"In Victoria, the Brush-tailed Rock-wallaby now...",Major threats \nChanges to habitat and the imp...
3,Eastern Barred Bandicoot,https://www.zoo.org.au/media/2053/4376_eastern...,Small Eastern Barred Bandicoot side view forag...,The Eastern Barred Bandicoot is listed as exti...,The plan for recovery\nZoos Victoria has partn...
4,Giant Burrowing Frog,https://www.zoo.org.au/media/2056/23479_giant_...,Giant Burrowing Frog on wet rocks side view. T...,Although we know that populations of the Giant...,"[Over the next five years, Zoos Victoria will ..."
5,Golden-rayed Blue Butterfly,https://www.zoo.org.au/media/1703/animal-detai...,Golden Rayed Blue Butterfly resting with wings...,This Victorian butterfly is critically endange...,[It lives in the narrow bands of remnant flood...
6,Grassland Earless Dragon,https://www.zoo.org.au/media/1716/23200_grassl...,Grassland Earless Dragon lizard side view look...,The Grassland Earless Dragon has not appeared ...,The plan for recovery\nZoos Victoria has been ...
7,Guthega Skink,https://www.zoo.org.au/media/1980/10411_gutheg...,Baby Guthega Skink side view. Lizard looking t...,The Guthega Skink is endangered in Victoria an...,Major threats\nHabitat disturbance caused by g...
8,Helmeted Honeyeater,https://www.zoo.org.au/media/1681/18660_juveni...,Juvenile Helmeted Honeyeater standing on a tre...,"The Helmeted Honeyeater, the bird emblem for V...",Major threats\nLoss of habitat is the primary ...
9,Key’s Matchstick Grasshopper,https://www.zoo.org.au/media/1702/animal-detai...,Key's Matchstick Grasshopper. Its brown with a...,This distinctive grasshopper is flightless and...,[The Key’s Matchstick Grasshopper hasn’t been ...


In [22]:
# Export to csv for merging against the vba fauna data
final_animal_image_df.to_csv("../data/animal_image_to_merge.csv", index=False)