In [98]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup

import time
import numpy as np
import pandas as pd

# configure Firefox Driver
def configure_firefox_driver(executable_path="./geckodriver", headless=True):
    """Create and return a Firefox WebDriver instance.

    Parameters
    ----------
    executable_path : str, optional
        Path to the geckodriver binary. Defaults to ``"./geckodriver"``.
    headless : bool, optional
        When True (the default) Firefox is started with ``--headless`` so
        no browser window is opened.

    Returns
    -------
    selenium.webdriver.Firefox
        A started browser session. The caller owns it and should call
        ``quit()`` when done.
    """
    # Local import: only needed to inspect which constructor keywords the
    # installed Selenium version accepts.
    import inspect

    firefox_options = FirefoxOptions()
    if headless:
        firefox_options.add_argument("--headless")

    # Selenium 4 expects the driver binary to be supplied through a Service
    # object; the `executable_path` keyword was deprecated in Selenium 4 and
    # is rejected by newer releases. Older releases have no `service`
    # keyword, so pick the call form the installed version supports.
    accepted_kwargs = inspect.signature(webdriver.Firefox.__init__).parameters
    if "service" in accepted_kwargs:
        from selenium.webdriver.firefox.service import Service

        return webdriver.Firefox(
            service=Service(executable_path=executable_path),
            options=firefox_options,
        )

    # Legacy constructor form.
    return webdriver.Firefox(executable_path=executable_path, options=firefox_options)

In [73]:
# Start the headless Firefox session once; the scraping loop below reuses this `driver`.
driver = configure_firefox_driver()

In [76]:
# Seconds to wait after each scroll so lazily-loaded content can be added.
SCROLL_PAUSE_TIME = 5
base_url = 'https://www.flickr.com/photos/sentinelhub/'

# Last path segment of the page to fetch next; '' selects the first page.
next_page = ''
# One parsed BeautifulSoup document per visited page, in visiting order.
soup_list = []

last_page = False
while not last_page:
    page_url = base_url + next_page
    print("Scraping:", page_url)
    driver.get(page_url)

    # Keep scrolling to the bottom until the document height stops growing,
    # i.e. no further content was appended after the pause.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page time to load the next batch of content.
        time.sleep(SCROLL_PAUSE_TIME)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, "lxml")
    soup_list.append(soup)

    # Follow the "next" pagination link; its absence marks the final page.
    next_link = soup.find('a', {"data-track": "paginationRightClick", "rel": "next"})
    if next_link is None:
        last_page = True
    else:
        # Keep only the last path segment of the link, which is appended to base_url.
        next_page = next_link['href'].rstrip('/').split('/')[-1]

https://www.flickr.com/photos/sentinelhub/
https://www.flickr.com/photos/sentinelhub/page2
https://www.flickr.com/photos/sentinelhub/page3
https://www.flickr.com/photos/sentinelhub/page4


In [100]:
# Collect the numeric id from every overlay link found on the scraped pages.
# The id is taken as the last path segment of the link's href.
collected_ids = []
for page_soup in soup_list:
    for anchor in page_soup.find_all('a', {"class": 'overlay'}):
        last_segment = anchor['href'].rstrip('/').split('/')[-1]
        collected_ids.append(int(last_segment))

# np.unique removes duplicates and returns the ids sorted.
entry_ids = np.unique(collected_ids)
df = pd.DataFrame({"entry_id": entry_ids})

# Persist the ids without the DataFrame index column.
df.to_csv('1_entry_ids.csv', index=False)