In [None]:
! pip3 install selenium
! pip3 install tqdm

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
from tqdm import tqdm

def collect_matching_image_attributes(url, load_wait_seconds=5):
    """Load ``url`` in headless Chrome and collect attributes of matching images.

    The page is rendered in a fresh headless Chrome session, then a JavaScript
    snippet selects every ``<img>`` whose ``src`` starts with the Cornell CDN
    asset prefix and extracts two values per image:

    * ``alt`` -- the image's alt text up to the first ``' - '``, trimmed.
    * ``src`` -- the second-to-last path segment of the image URL
      (presumably the asset id -- confirm against the CDN URL layout).

    Args:
        url: Address of the page to open.
        load_wait_seconds: Fixed delay, in seconds, to let the page finish
            loading before the images are read. Defaults to 5.

    Returns:
        A list of ``{'alt': ..., 'src': ...}`` dicts, one per matching image.
        An empty list is returned when anything goes wrong (the error is
        printed), so callers can always build a DataFrame from the result.
    """
    # Bind the name up front: if driver creation fails, ``finally`` must not
    # trip over an unbound variable and hide the real error.
    driver = None
    try:
        # Set up Chrome options to run headless
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')

        # Create a new instance of the Chrome driver
        driver = webdriver.Chrome(options=chrome_options)

        # Open the website
        driver.get(url)

        # Allow time for the page to load (adjust load_wait_seconds if needed)
        time.sleep(load_wait_seconds)

        # Execute the provided JavaScript code on the page
        matching_images = driver.execute_script("""
            const prefix = 'https://cdn.download.ams.birds.cornell.edu/api/v1/asset/';
            const images = document.querySelectorAll('img');

            const matchingImages = Array.from(images).filter((img) => img.src.startsWith(prefix)).map((fimg) => {
                const alt = fimg.alt.split(' - ')[0].trim();
                const srcParts = fimg.src.split('/');
                const src = srcParts[srcParts.length - 2];

                return { alt, src };
            });

            return matchingImages;
        """)

        return matching_images

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller carry on
        # with "no images" for this page.
        print(f"An error occurred while collecting matching image attributes: {str(e)}")
        return []

    finally:
        # Close the browser window (only if one was actually opened)
        if driver is not None:
            driver.quit()

# Find the last species number already exported and use it as our starting point
from os import listdir, makedirs
from os.path import isdir, isfile, join

# NOTE(review): progress is read from RESUME_DIR but batches are written to
# OUTPUT_DIR. The two paths differ in the original code; confirm whether they
# are meant to be the same directory, otherwise resuming will not see the
# files this script writes.
RESUME_DIR = 'data/image_downloads/'
OUTPUT_DIR = '../image_downloads/'
BATCH_SIZE = 25  # number of species per exported CSV


def _last_species_number(filename):
    """Return the trailing number of ``image_downloads_<a>_to_<b>.csv``, or None.

    The file extension is stripped before parsing; names that do not end in a
    number (e.g. hidden files) yield None instead of raising.
    """
    stem = filename.rsplit('.', 1)[0]
    tail = stem.split('_')[-1]
    return int(tail) if tail.isdigit() else None


def _flush_batch(frames, last_index):
    """Write one batch of per-species frames to a CSV and return the combined frame.

    ``last_index`` is the 1-based number of the last species in ``frames``; the
    first number is derived from the batch length so partial batches are named
    correctly too.
    """
    first_index = last_index - len(frames) + 1
    batch = pd.concat(frames, ignore_index=True)
    batch.to_csv(join(OUTPUT_DIR, f'image_downloads_{first_index}_to_{last_index}.csv'))
    return batch


# A missing directory or an empty one simply means "start from the beginning".
starting_point = 0
if isdir(RESUME_DIR):
    saved_numbers = [
        _last_species_number(f)
        for f in listdir(RESUME_DIR)
        if isfile(join(RESUME_DIR, f))
    ]
    starting_point = max((n for n in saved_numbers if n is not None), default=0)

# Import list of bird species from the eBird taxonomy 2023 file.
# Filter to species first, then skip the ones already scraped: starting_point
# counts species, not raw taxonomy rows.
taxonomy = pd.read_csv('../ebird_taxonomy_v2023.csv')
species_list = taxonomy.loc[taxonomy.CATEGORY == 'species', 'SPECIES_CODE'].iloc[starting_point:]

# Initialise variables
df_birds = pd.DataFrame()
batch_frames = []
# Continue numbering where the previous run stopped so earlier files are not overwritten.
index = starting_point

makedirs(OUTPUT_DIR, exist_ok=True)

# Iterate through each website and collect matching image attributes. Export CSV every 25 species.
for species in tqdm(species_list):
    matching_image_attributes = collect_matching_image_attributes(f"https://media.ebird.org/catalog?taxonCode={species}&sort=rating_rank_desc&mediaType=photo")
    # ``or []`` keeps the loop alive if the collector reports a failure with None.
    df_images = pd.DataFrame.from_records(matching_image_attributes or []).assign(species_code=species)
    batch_frames.append(df_images)

    index += 1
    # Export all image rows of the last BATCH_SIZE species as one CSV
    if index % BATCH_SIZE == 0:
        df_birds = pd.concat([df_birds, _flush_batch(batch_frames, index)], ignore_index=True)
        batch_frames = []

# Export whatever is left over when the species count is not a multiple of BATCH_SIZE
if batch_frames:
    df_birds = pd.concat([df_birds, _flush_batch(batch_frames, index)], ignore_index=True)
    batch_frames = []
    

  2%|▏         | 197/11017 [27:59<25:24:49,  8.46s/it]