# Selenium web scraper using Chromium
v0.4

In [None]:
import os
# Add the chromedriver folder to the PATH. By default the binary is expected in a
# "webdriver" subfolder of the notebook's current folder: just place the binary inside.
# The quoted "__file__" is a plain string, so realpath() resolves it against the
# current working directory and dirname() then yields that directory.
webdriver_dir = os.path.join(os.path.dirname(os.path.realpath("__file__")), "webdriver")
# Use the platform's own PATH separator, and do not append the folder again when
# this cell is re-run.
if webdriver_dir not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + webdriver_dir
os.environ['PATH']

In [None]:
# Parameters

# Cookies used to get authenticated. Logging in with a password through
# Selenium IDE is also possible, but it adds more steps and is less secure.
cookies = [
    {'name': 'cookie1', 'value': 'value1', 'domain': 'website.com'},
    {'name': 'cookie2', 'value': 'value2', 'domain': 'website.com'},
]

# Directory the quoted "__file__" placeholder resolves into
curpath = os.path.dirname(os.path.realpath("__file__"))

# Local base folder where the downloads are saved
rootfolder = "/".join((curpath, 'downloaded'))

In [32]:
from lxml import etree
from pathvalidate import sanitize_filename
from html2text import html2text
import time
import requests
import random
from tqdm.auto import tqdm

# Generated by Selenium IDE
import pytest
import time
import json
#from selenium import webdriver
from seleniumwire import webdriver  # this is NOT autogenerated, this allows to sniff media files
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

class Scraper():
    """Browser-driven scraper that saves abstracts and their media files.

    Typical use: ``setup_method()`` to start the browser, ``preset_cookies()``
    to authenticate, then ``scrape_one_abstract()`` or
    ``scrape_all_abstracts()``, and finally ``teardown_method()``.

    Files are written below the module-level ``rootfolder``, one sub-folder
    per abstract (named after the page's first ``<h1>``).
    """

    # XPath matching every abstract link on the listing page
    _LINKS_XPATH = "//ul[@id='agenda']//a"

    def setup_method(self, method=None, cookies=None):
        """Start the Chrome driver and initialise the URL store.

        ``method`` and ``cookies`` are accepted but not used here; cookies are
        loaded separately by ``preset_cookies()``.
        """
        self.driver = webdriver.Chrome()
        self.store = {}
        self.store['base'] = "https://www.website.com/"
        self.store['dummyurl'] = "https://www.website.com/error404"  # can also try to access an image instead
        self.store['dummyurl_redirect'] = "https://www.website.com/frontpage-after-login"  # url to access after, because some websites require users to access the frontpage before being able to access sublinks as a protection against bots
        self.store['realbase'] = "https://www.website.com/listing-of-things-to-download"
        self.vars = {}

    def preset_cookies(self, cookies=None):
        """Load authentication cookies into the browser session.

        Cookies can only be added for the domain currently displayed, so a
        dummy URL on the same domain is opened first, the cookies are added,
        and then the post-login front page is visited.
        From: https://stackoverflow.com/questions/36305660/selenium-js-add-cookie-to-request

        Args:
            cookies: iterable of cookie dicts as accepted by the driver's
                ``add_cookie()``; nothing is added when empty or ``None``.
        """
        # Debug helpers: self.driver.delete_all_cookies() / self.driver.get_cookies()

        # Navigate to a dummy url on the same domain.
        self.driver.get(self.store['dummyurl'])

        # Load cookies on this instance's own driver (not on a notebook global)
        if cookies:
            for c in cookies:
                self.driver.add_cookie(c)

        # Access the front page and wait a bit, otherwise other requests will fail,
        # because some websites require users to access the frontpage before being
        # able to access sublinks as a protection against bots
        self.driver.get(self.store['dummyurl_redirect'])
        time.sleep(3)

    def _fetch_listing_links(self):
        """Open the listing page and return its link elements.

        The links must be fetched again after every navigation, otherwise
        the previously found elements become detached.
        """
        self.driver.get(self.store['realbase'])
        return self.driver.find_elements(By.XPATH, self._LINKS_XPATH)

    def count_links(self):
        """Store the number of links of the listing in ``self.vars['all_links_count']``."""
        self.vars['all_links_count'] = len(self._fetch_listing_links())

    def scrape_all_abstracts(self, restart=None):
        """Scrape every abstract of the listing, one link after the other.

        Args:
            restart: optional index; links whose index is lower are skipped,
                which allows resuming an interrupted run.

        Returns:
            1 once the loop has completed.
        """
        self.count_links()
        for abstract_id in tqdm(range(self.vars['all_links_count'])):
            if restart and abstract_id < restart:
                continue
            # Go back to the listing and fetch fresh (attached) links
            all_links = self._fetch_listing_links()
            try:
                all_links[abstract_id].click()
            except Exception:
                # Skip to the next link if this one has issues; Exception (rather
                # than a bare except) keeps Ctrl-C able to interrupt the run
                continue
            # Download files
            self.download_abstract()
            self.download_poster_mediafiles()
            # Wait a random time to avoid the bot being detected
            time.sleep(random.uniform(1, 10))
        # Done
        return 1

    def scrape_one_abstract(self, abstract_id):
        """Scrape a single abstract, given its index in the listing.

        If clicking the requested link fails, the next link is tried instead.

        Returns:
            1 once the files have been downloaded.
        """
        all_links = self._fetch_listing_links()
        try:
            all_links[abstract_id].click()
        except Exception:
            # Fall back to the next link if this one has issues
            abstract_id += 1
            all_links[abstract_id].click()
        # Download files
        self.download_abstract()
        self.download_poster_mediafiles()
        # Done
        return 1

    def download_abstract(self):
        """Save the currently displayed abstract as HTML and as markdown.

        Creates ``<rootfolder>/<sanitized title>/`` if needed, remembers it in
        ``self.vars['poster_folder']`` and writes ``abstract.html`` (full page
        source) and ``abstract.md`` (converted main content) into it.
        """
        # Wait a bit because otherwise the page may not have loaded yet
        time.sleep(2)  # TODO: for more robust methods, see https://stackoverflow.com/questions/5868439/wait-for-page-load-in-selenium and https://artoftesting.com/wait-for-page-to-load-selenium-webdriver-java
        # Get HTML source code
        poster_abstract = self.driver.page_source
        # Extract abstract title from the first <h1>
        tree = etree.HTML(poster_abstract)
        poster_title = tree.xpath('//h1')[0].text
        # Create poster folder
        poster_folder = "%s/%s" % (rootfolder, sanitize_filename(poster_title))
        self.vars['poster_folder'] = poster_folder
        os.makedirs(poster_folder, exist_ok=True)
        # Save HTML in the adequate folder
        with open("%s/abstract.html" % (poster_folder), "wb") as f:
            f.write(poster_abstract.encode('utf-8'))
        # Also save the abstract as a text (markdown) file, converting and removing all superfluous HTML markups
        poster_abstract_body = tree.xpath('//div[@class=\'main-popup-content\']')
        poster_abstract_body_html = etree.tostring(poster_abstract_body[0], pretty_print=True)
        # NOTE: str() is applied to the serialized element, hence the removal of the
        # literal "\t" / "\n" escape sequences; the [2:-1] slice drops the first two
        # and the last character of the converted text (presumably the b'...' wrapper
        # of a bytes repr - TODO confirm before changing this line)
        poster_abstract_body_text = html2text(str(poster_abstract_body_html)).replace('\\t', '').replace('\\n', '')[2:-1]
        with open("%s/abstract.md" % (poster_folder), "wb") as f:
            f.write(poster_abstract_body_text.encode('utf-8'))
        # To download complete source code with CSS, JS etc:
        # https://stackoverflow.com/questions/42900214/how-to-download-a-html-webpage-using-selenium-with-python

    def download_poster_mediafiles(self):
        """Open the poster and download the media files it requests.

        Must be called after ``download_abstract()``, which sets
        ``self.vars['poster_folder']``, the destination folder.
        """
        # TODO: In the future, use Selenium if there are securities: https://sqa.stackexchange.com/questions/2197/how-to-download-a-file-using-seleniums-webdriver
        # Clear previous requests, otherwise we will keep on redownloading the same stuff again and again
        del self.driver.requests
        # Access poster
        self.driver.find_element(By.CSS_SELECTOR, ".pull-right > .btn-lg").click()
        # Sniff media files (audio, poster)
        time.sleep(3)  # wait a bit for the poster to load, TODO: see driver.wait_for_request()
        # Wait for the request/response to complete
        self.driver.wait_for_request('4000px')
        # A set, so that a URL requested several times is downloaded only once
        mediafiles = set()
        for request in self.driver.requests:
            if not request.response:
                continue
            if '4000px.png' in request.url or request.response.headers['content-type'] == 'audio/mpeg':
                mediafiles.add(request.url)
        # Download media files
        poster_folder = self.vars['poster_folder']
        for file_url in mediafiles:
            file_dl = requests.get(file_url, allow_redirects=True)
            # File name = last path segment of the URL
            filename = file_url.rsplit('/', 1)[1]
            with open('%s/%s' % (poster_folder, filename), 'wb') as f:
                f.write(file_dl.content)

    def teardown_method(self, method=None):
        """Close the browser; ``method`` is accepted but not used."""
        self.driver.quit()

# Create the scraper object; the browser itself is only started by setup_method()
t = Scraper()
# Bare expression: shows the object's repr as the cell output
t

<__main__.Scraper at 0x1eb2b314730>

In [33]:
# Start the Chrome driver and initialise the scraper's URL store
t.setup_method()

In [34]:
# Authenticate the browser session with the cookies defined in the parameters cell
t.preset_cookies(cookies=cookies)

In [35]:
# Trial run: scrape only the first link of the listing
first_result = t.scrape_one_abstract(0)
if first_result == 1:
    print('All done!')

All done!


In [40]:
# Now for the real download of all links
caution_message = '''CAUTION: to download ALL media files including audio and video,
      you need to keep the browser window NOT minimized (ie, in background it's fine, but the window must be open),
      otherwise the media won't play and thus won't be downloaded!'''
print(caution_message)
# restart=856: links whose index is below 856 are skipped
scrape_status = t.scrape_all_abstracts(restart=856)
if scrape_status == 1:
    print('All done!')

CAUTION: to download ALL media files including audio and video,
      you need to keep the browser window NOT minimized (ie, in background it's fine, but the window must be open),
      otherwise the media won't play and thus won't be downloaded!


HBox(children=(FloatProgress(value=0.0, max=1352.0), HTML(value='')))


All done!


------------------------------------

## Tests

In [None]:
import requests

In [None]:
# Cookies to check, in the same list-of-dicts layout as in the parameters cell
cookies = [
    {'name': 'cookie1', 'value': 'value1', 'domain': 'website.com'},
    {'name': 'cookie2', 'value': 'value2', 'domain': 'website.com'},
]
# Page requested to check that the cookies work
url = "https://website.com/frontpage-after-login"

In [None]:
# requests expects a name -> value mapping (or a CookieJar) for `cookies`, not the
# Selenium-style list of cookie dicts, so convert the list before sending it
r = requests.get(url, cookies={c['name']: c['value'] for c in cookies})

In [None]:
# Report any response other than HTTP 200 together with its status code
if r.status_code != 200:
    print('Failure to connect! Update cookies. Response code:', r.status_code, sep='\n')

# Bare expression: shows the response body as the cell output
r.text