In [1]:
import pandas as pd
import numpy as np
import pprint
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from datetime import date

In [2]:
class IFSCScraper():
    """
    Scraper used to gather data from the IFSC website (ifsc-climbing.org).

    Each instance owns its own Firefox WebDriver session. Call close() (or use the
    instance as a context manager) when finished so the browser process is released.

    Attributes:
        debug            - Flag passed to the constructor (stored, not otherwise used here)
        year_league_comb - List of (year, league) tuples filled by get_comp_years_and_league
        year_events      - Dict mapping year text -> list of event names, filled by
                           get_years_events
        browser          - The Selenium WebDriver instance, or None once closed
    """
    # Page url
    url = 'https://www.ifsc-climbing.org/index.php/world-competition/last-result'

    def __init__(self, debug=False):
        """
        Initialize a scraper object with its own browser instance
        Input:
            debug - Indicates whether this is a debug instance for quicker development
        Raises:
            Whatever webdriver.Firefox() raises if the browser cannot be started. The
            error is reported and re-raised because the scraper is unusable without it.
        """
        self.debug = debug
        self.year_league_comb = []
        self.year_events = {}
        # Defined up front so close() is always safe to call.
        self.browser = None

        try:
            self.browser = webdriver.Firefox()
        except Exception as ex:
            print(f'Error: Could not create WebDriver object... ({ex})')
            raise

        time.sleep(1)

    def __enter__(self):
        """Allow `with IFSCScraper() as scraper:` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the browser when leaving a `with` block; exceptions propagate."""
        self.close()
        return False

    def close(self):
        """
        Quit the browser session if one is open. Safe to call more than once.
        """
        browser = getattr(self, 'browser', None)
        if browser is None:
            return
        try:
            browser.quit()
        finally:
            self.browser = None

    def load_page(self, link, timeout=10, wait_after=5):
        """
        Helper function that loads a page and waits for timeout
        input:
            link - Link to the page we wish to load
            timeout - Seconds to wait before timing out
            wait_after - Seconds to wait after loading
        output:
            True if the page container became visible, False if the wait timed out
            (in which case the browser has been closed)
        """

        # Visit link
        self.browser.get(link)
        # Session-wide setting: later find_element calls wait up to 3 seconds.
        self.browser.implicitly_wait(3)

        # Wait for the main container to show up
        try:
            WebDriverWait(self.browser, timeout).until(
                EC.visibility_of_element_located((By.XPATH, "//div[@class='uk-container']"))
            )
        except TimeoutException:
            print("Timed out waiting for page " + link + " to load")
            self.close()
            # Report the failure instead of sleeping and pretending the page loaded.
            return False

        # Wait for page to load
        time.sleep(wait_after)
        return True

    def _open_results_form(self):
        """
        Load the results page, switch into the iframe holding the dropdown menus and
        locate the 'Year' and 'League' <select> elements.
        output:
            (year_dd, league_dd) tuple of elements, or None if the page did not load
        """
        if not self.load_page(IFSCScraper.url):
            return None

        # The dropdown menus to pick years/leagues/events is in an iframe - we need to switch to it
        frame = self.browser.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/iframe')
        self.browser.switch_to.frame(frame)

        # Dropdown menus for each choice
        year_dd = self.browser.find_element(By.XPATH, '//select[@id="years"]')
        league_dd = self.browser.find_element(By.XPATH, '//select[@id="indexes"]')
        return year_dd, league_dd

    def get_comp_years_and_league(self):
        """
        Parse the world-competition/last-result page to find and return years and leagues
        input:
            N/A
        output:
            List of tuples containing comp year and league (self.year_league_comb).
            Errors while scraping are printed, and whatever was collected so far is
            returned.
        """

        try:
            dropdowns = self._open_results_form()
            if dropdowns is None:
                return self.year_league_comb
            year_dd, league_dd = dropdowns

            # Extract text of each 'Year' option
            years = [opt.text for opt in Select(year_dd).options]
            # Index 0 is skipped (see the [1:] slices used for events);
            # index 1 is the only league scraped for now (world cup).
            league = Select(league_dd).options[1].text

            for year in years:
                if (year, league) not in self.year_league_comb:
                    self.year_league_comb.append((year, league))
        except Exception as ex:
            print(ex)

        return self.year_league_comb

    def get_years_events(self):
        """
        Iterate through each year collected by get_comp_years_and_league and record
        that year's events
        input:
            N/A
        output:
            Dict mapping year -> list of event names (self.year_events). Years whose
            event list does not appear within 10 seconds are reported and skipped.
        """
        # Nothing to look up: avoid a pointless page load.
        if not self.year_league_comb:
            return self.year_events

        dropdowns = self._open_results_form()
        if dropdowns is None:
            return self.year_events
        year_dd, league_dd = dropdowns

        wait = WebDriverWait(self.browser, 10)

        # Iterate through years provided by get_comp_years_and_league
        for year, _league in self.year_league_comb:
            # NOTE(review): the year strings were collected from option text; this
            # relies on each option's value attribute matching its text.
            Select(year_dd).select_by_value(year)
            Select(league_dd).select_by_index(1)  # starts at index 1

            # Waits for third dropdown to populate with options based on the league
            # NOTE(review): if the previous year's options are still displayed this
            # wait can return immediately - confirm the page clears them on change.
            try:
                wait.until(EC.visibility_of_element_located(
                    (By.CSS_SELECTOR, "#events > option:nth-child(2)")))
            except TimeoutException:
                print("Timed out waiting for events of year " + str(year))
                continue

            # Look the dropdown up only after it has been populated so the reference is fresh
            event_dd = self.browser.find_element(By.XPATH, '//select[@id="events"]')

            # Gets event options for year and add to dictionary
            self.year_events[year] = [opt.text for opt in Select(event_dd).options[1:]]

        # TODO: the fourth dropdown ('categories', select#categories) is not scraped yet.
        return self.year_events

    def check_for_results(self, year, league):
        """
        Placeholder - not implemented yet.
        input:
            year - Competition year
            league - Competition league
        """
        pass
    
def display_events(dict):
    """
    Print every year followed by an indented list of that year's events.
    input:
        dict - Mapping of year (anything int() accepts) to an iterable of events
    output:
        N/A (writes to stdout)
    """
    for year_key, event_names in dict.items():
        print(f'{int(year_key)}:')
        for event_name in event_names:
            print('    ', event_name)

In [3]:
# Create the scraper; the constructor starts a Firefox WebDriver session.
scraper = IFSCScraper()
# Collect the (year, league) pairs first - get_years_events iterates over them.
scraper.get_comp_years_and_league()
# Fill scraper.year_events with the event names listed for each collected year.
scraper.get_years_events()

In [6]:
# display_events(scraper.year_events)