In [1]:
import pandas as pd
import numpy as np
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [141]:
class IFSCScraper():
    """
    Scraper used to gather data from the IFSC website (ifsc-climbing.org)

    Each instance owns its own Firefox WebDriver session, stored in self.browser.
    self.browser is None whenever no usable session exists: the driver could not
    be created, or the session was shut down (explicitly or after an error).
    """

    def __init__(self, debug=False):
        """
        Initialize a scraper object with its own browser instance
        Input:
            debug - Indicates whether this is a debug instance for quicker development
        Output:
            N/A - if the WebDriver cannot be created an error is printed and
                  self.browser is left as None (no exception is raised)
        """
        self.debug = debug

        # Always define the attribute so a failed start-up can be detected later
        # instead of surfacing as an unrelated AttributeError
        self.browser = None

        try:
            self.browser = webdriver.Firefox()
        except Exception as err:
            print('Error: Could not create WebDriver object...')
            print('Cause: ' + repr(err))
            return

        # Short pause after the driver has been created
        time.sleep(1)

    def close(self):
        """
        Shut down the browser session if there is one. Safe to call repeatedly
        input:
            N/A
        output:
            N/A
        """
        # Drop the reference first so the scraper never keeps a half-closed session
        browser, self.browser = self.browser, None
        if browser is None:
            return

        try:
            browser.quit()
        except Exception as err:
            print('Warning: problem while closing the browser: ' + repr(err))

    def load_page(self, link, timeout=10, wait_after=5):
        """
        Helper function that loads a page and waits for its main container to show up
        input:
            link - Link to the page we wish to load
            timeout - Seconds to wait for the 'uk-container' div before timing out
            wait_after - Seconds to wait after the page has loaded
        output:
            True if the page loaded, False if it timed out (in which case the
            browser has been closed and self.browser is None)
        raises:
            RuntimeError if the scraper has no browser session
        """
        if self.browser is None:
            raise RuntimeError('No browser session available; cannot load ' + link)

        # Visit link
        self.browser.get(link)

        # Block until the main container is visible, or give up after `timeout` seconds
        try:
            WebDriverWait(self.browser, timeout).until(
                EC.visibility_of_element_located((By.XPATH, "//div[@class='uk-container']")))
        except TimeoutException:
            print("Timed out waiting for page " + link + " to load")
            self.close()
            # No point sleeping on a closed browser; report the failure to the caller
            return False

        # Extra fixed wait after the container is visible
        time.sleep(wait_after)
        return True

    def get_comp_links(self, year_index=1, league_index=1):
        """
        Open the world-competition/last-result page, pick a year and a league in the
        dropdown menus, and return the options offered by each of the four dropdowns
        input:
            year_index - Index of the entry to select in the year dropdown
                         (0 is the most recent year, so the default 1 is the year before)
            league_index - Index of the entry to select in the league dropdown
                           (index 0 is the 'Select league' placeholder)
        output:
            Tuple (year_opts, league_opts, event_opts, cat_opts); each item is the list
            of <option> elements of the corresponding dropdown. The option texts are
            also printed. If anything goes wrong the error is printed, the browser is
            closed, and four empty lists are returned.
        """

        # Page url
        url = 'https://www.ifsc-climbing.org/index.php/world-competition/last-result'

        try:
            if not self.load_page(url):
                # load_page already reported the timeout and closed the browser
                return [], [], [], []

            # The dropdown menus to pick years/leagues/events are in an iframe - we need to switch to it
            frame = self.browser.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/iframe')
            self.browser.switch_to.frame(frame)

            # Dropdown menus for each choice
            year_dd     = self.browser.find_element(By.XPATH, '//select[@id="years"]')
            league_dd   = self.browser.find_element(By.XPATH, '//select[@id="indexes"]')
            event_dd    = self.browser.find_element(By.XPATH, '//select[@id="events"]')
            category_dd = self.browser.find_element(By.XPATH, '//select[@id="categories"]')

            # Make the selections (select_by_index returns nothing, so there is nothing to keep)
            Select(year_dd).select_by_index(year_index)
            Select(league_dd).select_by_index(league_index)

            # Presumably the events dropdown is filled in only after a year and league
            # are chosen; the fixed sleep gives the page time to do that
            wait = WebDriverWait(self.browser, 5)
            wait.until(EC.presence_of_element_located((By.XPATH, "//select[@id='events']")))
            time.sleep(3)

            # Option elements currently offered by each dropdown
            year_opts   = Select(year_dd).options
            league_opts = Select(league_dd).options
            event_opts  = Select(event_dd).options
            cat_opts    = Select(category_dd).options

            # Show the visible text of every option
            print([x.text for x in year_opts])
            print([x.text for x in league_opts])
            print([x.text for x in event_opts])
            print([x.text for x in cat_opts])
        except Exception as err:
            # Report the real cause; previously this path ended in an UnboundLocalError
            print('Error: Could not read the competition dropdowns: ' + repr(err))
            self.close()
            return [], [], [], []

        return year_opts, league_opts, event_opts, cat_opts

In [142]:
# Create a scraper; its constructor tries to start a Firefox WebDriver session.
scraper = IFSCScraper()
# get_comp_links returns four values in this order: the year, league, event and
# category dropdown options. They are unpacked into y, l, e and c respectively.
y, l, e, c = scraper.get_comp_links()

['2023', '2022', '2021', '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996', '1995', '1994', '1993', '1992', '1991', '1990']
['Select league', 'World Cups and World Championships', 'IFSC Youth', 'IFSC Asia Adults', 'IFSC Asia Youth', 'IFSC Europe Adults', 'IFSC Europe Youth', 'Games', 'IFSC Pan America Adults', 'IFSC Africa', 'IFSC Paraclimbing', 'IFSC Oceania', 'Other events', 'Masters and Promotional Events']
['Select event', 'IFSC - Climbing World Cup (B) - Meiringen (SUI) 2022', 'IFSC - Climbing World Cup (B,S) - Seoul (KOR) 2022', 'IFSC - Climbing World Cup (B,S) - Salt Lake City (USA) 2022', 'IFSC - Climbing World Cup (B,S) - Salt Lake City (USA) 2022', 'IFSC - Climbing World Cup (B) - Brixen (ITA) 2022', 'IFSC - Climbing World Cup (B,L) - Innsbruck (AUT) 2022', 'IFSC - Climbing World Cup (L,S) - Villars (SUI) 2022', 'IFSC - Climbing 