In [329]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup

import time
import numpy as np
import pandas as pd
import re

from mistletoe import markdown
from html2text import HTML2Text

from tqdm.auto import tqdm

def html2md(html):
    """Convert an HTML fragment to Markdown text using html2text.

    Images are dropped and line wrapping is disabled (``body_width = 0``).

    NOTE(review): the link switch html2text documents is ``ignore_links``;
    confirm that ``ignore_anchors`` is honoured at all. get_page_content
    searches the converted text for a Markdown link, so links appear to be
    expected to survive the conversion.
    """
    converter = HTML2Text()
    settings = {"ignore_images": True, "ignore_anchors": True, "body_width": 0}
    for option, value in settings.items():
        setattr(converter, option, value)
    return converter.handle(html)


# configure Firefox Driver
def configure_firefox_driver(executable_path="./geckodriver", headless=True):
    """Create a Firefox WebDriver, headless by default.

    Args:
        executable_path: Path to the geckodriver binary.
        headless: When true, pass ``--headless`` to Firefox.

    Returns:
        Whatever ``webdriver.Firefox`` returns for the built options.
    """
    firefox_options = FirefoxOptions()
    if headless:
        firefox_options.add_argument("--headless")

    # Newer Selenium 4 releases dropped the ``executable_path`` keyword and
    # expect the driver path on a Service object instead. Use the Service
    # route when this Selenium exposes ``webdriver.FirefoxService`` and keep
    # the legacy keyword for older installations.
    service_cls = getattr(webdriver, "FirefoxService", None)
    if service_cls is not None:
        return webdriver.Firefox(
            service=service_cls(executable_path=executable_path),
            options=firefox_options,
        )
    return webdriver.Firefox(executable_path=executable_path, options=firefox_options)

def get_loaded_page(driver, url, pause):
    """Open ``url``, scroll to the bottom until the page stops growing, and parse it.

    Args:
        driver: WebDriver-like object providing ``get``, ``execute_script``
            and ``page_source``.
        url: Address to load.
        pause: Seconds to sleep after each scroll so new content can load.

    Returns:
        A BeautifulSoup tree (``lxml`` parser) of the final page source.
    """
    height_query = "return document.body.scrollHeight"

    driver.get(url)
    previous_height = driver.execute_script(height_query)

    while True:
        # Jump to the current bottom, then give the page time to extend itself.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)

        # An unchanged height after a scroll means nothing more was loaded.
        current_height = driver.execute_script(height_query)
        if current_height == previous_height:
            break
        previous_height = current_height

    return BeautifulSoup(driver.page_source, "lxml")

In [330]:
# One shared browser session for the whole notebook; get_page_content reads
# this module-level `driver` rather than taking it as an argument.
driver = configure_firefox_driver()

In [331]:
# Ids to scrape, taken from the `entry_id` column of 1_entry_ids.csv
# (presumably written by an earlier step of this pipeline; confirm).
entry_ids = pd.read_csv('1_entry_ids.csv').entry_id.values

In [359]:
# Patterns for the structured fields embedded in the description text.
# Compiled once here instead of on every call.
_AUTHOR_RE = re.compile(r'Author: ([a-zA-ZćčšžđĆČŠŽĐ ]+)\n')
_STYLE_RE = re.compile(r'[0-9]+-[0-9]+-[0-9]+\n([a-zA-Z0-9-+ ]+)')
_DATE_RE = re.compile(r'Date: ([0-9]+-[0-9]+-[0-9]+)\n')
_TIME_RANGE_RE = re.compile(r'Time Range: ([0-9]+-[0-9]+-[0-9]+) - ([0-9]+-[0-9]+-[0-9]+)\n')
_EOBROWSER_RE = re.compile(r'\[Inspect in EO Browser\]\(([a-zA-Z0-9:/.-]+)\)')
_SIZE_RE = re.compile(r'([a-zA-Z0-9 ]+)\n\(([0-9]+) × ([0-9]+)\)')


def _first_match(pattern, text):
    """Return the first ``findall`` result of ``pattern`` in ``text``, or None if there is none."""
    hits = pattern.findall(text)
    return hits[0] if hits else None


def _label_count(soup, css_class):
    """Parse the comma-grouped integer shown in the ``<span>`` carrying ``css_class``."""
    return int(soup.find('span', {'class': css_class}).get_text().strip().replace(',', ''))


def get_page_content(entry_id):
    """Scrape one sentinelhub Flickr photo entry into a flat dict.

    Three pages are loaded through the module-level ``driver``: the photo
    page, its ``/sizes/o`` page, and the page of the last size link listed
    there (from which the download link is read).

    Args:
        entry_id: Photo id appended to the sentinelhub photo base URL.

    Returns:
        dict with keys ``entry_id``, ``title``, ``page_url``, ``author``,
        ``style``, ``dates``, ``full_description``, ``eobrowser_link``,
        ``view_count``, ``comment_count``, ``fave_count``, ``location``,
        ``comments``, ``original_size``, ``image_size``, ``image_link`` and
        ``dl_link``. ``author``, ``style``, ``eobrowser_link`` and
        ``location`` are None when not found; ``dates`` is then ``[]``.

    Raises:
        Lookup errors on the required elements (title, description, the
        three counters, the size table, the download link) are not caught
        here and propagate to the caller.
    """
    base_url = 'https://www.flickr.com/photos/sentinelhub/'
    page_url = base_url + str(entry_id)
    soup = get_loaded_page(driver, page_url, 1)

    # Required fields: a missing element is an error for this entry.
    title = soup.find('h1', {'class': "meta-field photo-title"}).get_text()
    description = soup.find('meta', {"name": "description"})['content']
    view_count = _label_count(soup, "view-count-label")
    comment_count = _label_count(soup, "comment-count-label")
    fave_count = _label_count(soup, "fave-count-label")

    # The final character of the converted text is dropped.
    full_description = html2md(description)[:-1]

    # Optional field: not every photo carries a location link.
    location_tag = soup.find('a', {'class': "location-name-link"})
    location = location_tag.get_text().strip() if location_tag is not None else None

    # Each comment's inner HTML is converted to Markdown.
    comments = [
        html2md(''.join(str(child) for child in comment.contents))[:-1]
        for comment in soup.find_all('div', {'class': "comment-content"})
    ]

    # Optional structured fields; a pattern that does not match yields None.
    author = _first_match(_AUTHOR_RE, description)
    style = _first_match(_STYLE_RE, description)

    # A single "Date:" wins; otherwise fall back to a "Time Range:" pair.
    single_date = _first_match(_DATE_RE, description)
    if single_date is not None:
        dates = [single_date]
    else:
        date_range = _first_match(_TIME_RANGE_RE, description)
        dates = list(date_range) if date_range is not None else []

    eobrowser_link = _first_match(_EOBROWSER_RE, full_description)

    # Sizes page: the third <dd> holds the size labels and their links.
    sizes_soup = get_loaded_page(driver, page_url + '/sizes/o', 0.5)
    size_info = sizes_soup.find_all('dd')[2]
    sizes = _SIZE_RE.findall(size_info.get_text().replace('\t', ''))
    image_links = ['https://www.flickr.com' + tag['href'] for tag in size_info.find_all('a')]
    image_link = image_links[-1]

    # Page of the last listed size: the second <dd> holds the download anchor.
    image_soup = get_loaded_page(driver, image_link, 0.5)
    dl_link = image_soup.find_all('dd')[1].find('a')['href']

    return dict(
        entry_id=entry_id,
        title=title,
        page_url=page_url,
        author=author,
        style=style,
        dates=dates,
        full_description=full_description,
        eobrowser_link=eobrowser_link,
        view_count=view_count,
        comment_count=comment_count,
        fave_count=fave_count,
        location=location,
        comments=comments,
        original_size=f'[{"x".join(sizes[0][1:])}]',
        image_size=f'[{"x".join(sizes[-1][1:])}]',
        image_link=image_link,
        dl_link=dl_link,
    )

In [347]:
# Scrape every entry. A failing entry is recorded rather than silently
# dropped, and only ordinary exceptions are caught so that interrupting the
# kernel still stops the loop.
values_list = []
failed_entries = []  # (entry_id, repr of the exception) for each failure
for entry_id in tqdm(entry_ids):
    try:
        values_list.append(get_page_content(entry_id))
    except Exception as exc:
        failed_entries.append((entry_id, repr(exc)))

if failed_entries:
    print(f"{len(failed_entries)} of {len(entry_ids)} entries could not be scraped; see failed_entries")

  0%|          | 0/371 [00:00<?, ?it/s]

In [348]:
# Build the table in one step from the per-entry dicts: columns follow the
# dict key order. This avoids DataFrame.append (removed in pandas 2.0) and
# does not fail on an empty values_list.
result = pd.DataFrame(values_list)
result.to_pickle('./2_post_content.pkl')

In [362]:
# Show the scraped table as the cell output.
result

Unnamed: 0,entry_id,title,page_url,author,style,dates,full_description,eobrowser_link,view_count,comment_count,fave_count,location,comments,original_size,image_size,image_link,dl_link
0,31927027927,River Salso sediments,https://www.flickr.com/photos/sentinelhub/3192...,,,[],Contains modified Copernicus Sentinel data [20...,,1208,0,2,"Licata, Licata, Sicily",[],[1834x1020],[1834x1020],https://www.flickr.com/photos/sentinelhub/3192...,https://live.staticflickr.com/7927/31927027927...
1,31984210167,Madagascar’s Silting,https://www.flickr.com/photos/sentinelhub/3198...,,URL,[2017-07-18],"Bombetoka bay, Madagascar, false color (8,4,3)...",,828,0,2,"Boanamary, Boanamary, Boeny",[],[3669x1768],[3669x1768],https://www.flickr.com/photos/sentinelhub/3198...,https://live.staticflickr.com/7824/31984210167...
2,31984210297,Vietnam Flood,https://www.flickr.com/photos/sentinelhub/3198...,,URL,[2017-02-07],"Nha Trang, Vietnam, false color (11,8,2) Date:...",,703,0,0,"Xã Diên Phú, Tỉnh Khánh Hòa, Vietnam",[],[3669x1768],[3669x1768],https://www.flickr.com/photos/sentinelhub/3198...,https://live.staticflickr.com/4913/31984210297...
3,31984918877,"Musa Bay, Iran",https://www.flickr.com/photos/sentinelhub/3198...,,URL,[2018-12-29],"Bandar-e Emam Khomeyni, Iran, true color Date:...",,664,0,0,"Bandar Šâhpur, Khuzestan, Iran",[],[3623x1770],[3623x1770],https://www.flickr.com/photos/sentinelhub/3198...,https://live.staticflickr.com/7809/31984918877...
4,31985568217,"Chachani, Peru",https://www.flickr.com/photos/sentinelhub/3198...,,URL,[2018-08-23],"Chachani, Peru, false color (12,11,4) Date: 20...",,664,0,0,"Charcani Grande, Cayma, Arequipa",[],[3623x1676],[3623x1676],https://www.flickr.com/photos/sentinelhub/3198...,https://live.staticflickr.com/4881/31985568217...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,51004538951,Landscape of Western Australia,https://www.flickr.com/photos/sentinelhub/5100...,Monja Šebela,Sentinel-1 GRD + Modified,[2021-02-28],Landscape of Western Australia Date: 2021-02-2...,,543,0,0,"Bangemall, West Coast, Australia",[],[10920x6498],[6144x3656],https://www.flickr.com/photos/sentinelhub/5100...,https://live.staticflickr.com/65535/5100453895...
367,51004600502,"Lagos, Nigeria",https://www.flickr.com/photos/sentinelhub/5100...,Monja Šebela,Sentinel-1 GRD +,[2021-03-02],"Lagos, Nigeria Date: 2021-03-02 Sentinel-1 GRD...",,322,0,0,"Ogogoro, Lagos, Naija",[],[9408x6497],[6144x4243],https://www.flickr.com/photos/sentinelhub/5100...,https://live.staticflickr.com/65535/5100460050...
368,51004622497,Landscape of Western Australia,https://www.flickr.com/photos/sentinelhub/5100...,Monja Šebela,Sentinel-1 GRD +,[2021-02-21],Landscape of Western Australia Date: 2021-02-2...,,261,0,0,"Gascoyne Junction, West Coast, Australia",[],[10824x6497],[6144x3688],https://www.flickr.com/photos/sentinelhub/5100...,https://live.staticflickr.com/65535/5100462249...
369,51102722194,Farasan Islands,https://www.flickr.com/photos/sentinelhub/5110...,Monja Šebela,Sentinel-2 L2A + True Color Composite,[2019-04-29],Date: 2019-04-29 Sentinel-2 L2A + True Color C...,https://sentinelshare.page.link/4EwL,101,0,0,Saudi Arabia,[],[8976x6186],[6144x4234],https://www.flickr.com/photos/sentinelhub/5110...,https://live.staticflickr.com/65535/5110272219...
