In [1]:
import pandas as pd
import numpy as np
import pprint
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from datetime import date

In [2]:
class IFSCScraper():
    """
    Define a class for the scraper that will be used to gather data from the IFSC website
    (ifsc-climbing.org)
    Includes methods that allow for scraping different pages and different information

    Attributes:
        debug       - Flag stored from the constructor (no method in this class reads it yet)
        year_events - Dict mapping a year string to a list of (event name, event id) tuples,
                      filled in by get_years_events
        driver      - Selenium WebDriver instance, or None once the session has been ended
    """
    # Page url
    url = 'https://www.ifsc-climbing.org/index.php/world-competition/last-result'

    # Absolute XPath of the iframe that hosts the year/league/event dropdown menus
    frame_xpath = '/html/body/div[1]/div[4]/div/iframe'

    def __init__(self, debug=False):
        """
        Initialize a scraper object with its own browser instance
        Input:
            debug - Indicates whether this is a debug instance for quicker development
        Raises:
            Whatever generate_driver raises when the browser cannot be started
        """
        self.debug = debug
        self.year_events = {}
        # Defined up front so end_session() is safe even if the browser never starts
        self.driver = None

        self.generate_driver()
        time.sleep(1)

    def generate_driver(self):
        """
        Initialize Selenium web browser (Firefox) and store it on self.driver
        Input:
            N/A
        Raises:
            Re-raises the original error after printing a message, so callers never
            end up holding a scraper that has no usable driver
        """
        try:
            self.driver = webdriver.Firefox()
        except Exception:
            print('Error: Could not create WebDriver object...')
            raise

    def load_page(self, link, timeout=10, wait_after=5):
        """
        Helper function that loads a page and waits for timeout
        input:
            link - Link to the page we wish to load
            timeout - Seconds to wait before timing out
            wait_after - Seconds to wait after loading
        output:
            N/A
        Raises:
            TimeoutException - the page container did not become visible in time;
                               the browser session is closed before re-raising
        """
        # Visit link
        self.driver.get(link)

        # Wait until the main page container is visible
        try:
            WebDriverWait(self.driver, timeout).until(
                EC.visibility_of_element_located((By.XPATH, "//div[@class='uk-container']")))
        except TimeoutException:
            print("Timed out waiting for page " + link + " to load")
            self.end_session()
            # Re-raise: continuing would only fail later on a closed browser
            raise

        # Wait for page to load
        time.sleep(wait_after)

    def _switch_to_results_frame(self):
        """
        Switch the driver context into the iframe that contains the dropdown menus
        """
        frame = self.driver.find_element(By.XPATH, IFSCScraper.frame_xpath)
        self.driver.switch_to.frame(frame)

    def get_comp_years_and_league(self):
        """
        Parse the world-competition/last-result page to find and return years and leagues
        input:
            N/A
        output:
            List of unique (year, league) tuples, in dropdown order. Only the league at
            index 1 of the league dropdown is used (world cup only for now)
        Raises:
            Re-raises any page-loading error after closing the browser session
        """
        try:
            self.load_page(IFSCScraper.url)
        except Exception:
            print('Error loading page!')
            self.end_session()
            raise

        # The dropdown menus to pick years/leagues/events is in an iframe - we need to switch to it
        self._switch_to_results_frame()

        # Dropdown menus for each choice
        year_dd = self.driver.find_element(By.XPATH, '//select[@id="years"]')
        league_dd = self.driver.find_element(By.XPATH, '//select[@id="indexes"]')

        # Extract text of every year option and of the single league we care about
        years = [opt.text for opt in Select(year_dd).options]
        league = Select(league_dd).options[1].text  # world cup only for now

        # dict.fromkeys drops duplicates while keeping first-seen order
        # TODO: also collect the category options (fourth dropdown, select#categories)
        return list(dict.fromkeys((year, league) for year in years))

    def get_years_events(self, years):
        """
        Iterate through each year and get that years events
        input:
            years - Iterable of sequences whose first item is the year value to select
                    (e.g. the (year, league) tuples from get_comp_years_and_league)
        output:
            N/A - fills self.year_events[year] with (event name, event id) tuples and
            prints one results summary line per year
        Raises:
            Re-raises any page-loading error after printing a message
        """
        try:
            self.load_page(IFSCScraper.url)
        except Exception:
            print('Error loading page!')
            raise

        # The dropdown menus to pick years/leagues/events is in an iframe - we need to switch to it
        self._switch_to_results_frame()

        # Dropdown menus for each choice
        year_dd = self.driver.find_element(By.XPATH, '//select[@id="years"]')
        league_dd = self.driver.find_element(By.XPATH, '//select[@id="indexes"]')

        # Creating wait
        wait = WebDriverWait(self.driver, 10)

        # Iterate through years provided by get_comp_years_and_league
        for year_league in years:
            year = year_league[0]
            Select(year_dd).select_by_value(year)
            Select(league_dd).select_by_index(1)  # league options start at index 1

            # Selects third dropdown menu, events
            event_dd = self.driver.find_element(By.XPATH, '//select[@id="events"]')
            events_select = Select(event_dd)

            # Waits for third dropdown to populate with options based on the league choice
            # NOTE(review): if the previous year's options are still present when this runs,
            # the condition passes immediately - confirm the page clears them first
            wait.until(lambda d: len(events_select.options) > 1)

            # Gets event options and ids for year and add to dictionary (index 0 is skipped)
            event_opts = events_select.options[1:]
            events = [opt.text for opt in event_opts]
            event_ids = [opt.get_attribute("value").split('/')[-1] for opt in event_opts]
            self.year_events[year] = list(zip(events, event_ids))

            print(f'{year}: {self.check_for_event_results(self.driver, event_dd, events)}')

    def check_for_event_results(self, driver, dd, events):
        """
        Iterate through each event and check if results exist
        input:
            driver - Selenium WebDriver currently switched into the dropdown iframe
            dd     - The events <select> element
            events - List of visible event names to select one at a time
        output:
            Summary string '<n> of <total> events have results!', where an event counts
            as having results when the categories dropdown gets more than one option
            within 2 seconds
        """
        events_select = Select(dd)
        with_results = 0
        for event in events:
            # Select event in events dropdown
            events_select.select_by_visible_text(event)

            # Select fourth dropdown menu, categories
            cat_dd = driver.find_element(By.XPATH, '//select[@id="categories"]')
            cat_select = Select(cat_dd)
            try:
                WebDriverWait(driver, 2).until(lambda d: len(cat_select.options) > 1)
            except TimeoutException:
                # Dropdown never populated: treat as "no results" for this event.
                # Only the timeout is swallowed so interrupts and real errors surface.
                continue
            with_results += 1
        return f'{with_results} of {len(events)} events have results!'

    def end_session(self):
        """
        Quit the browser if one is open. Safe to call more than once.
        """
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            driver.quit()
        finally:
            # Forget the driver even if quit() fails so it is never quit twice
            self.driver = None

    def scrape_site(self):
        """
        Run the full scrape: collect the (year, league) pairs, then each year's events.
        The browser session is always closed afterwards, even when a step raises.
        """
        try:
            self.get_years_events(self.get_comp_years_and_league())
        finally:
            self.end_session()
        
def display_events(dict):
    """
    Print a mapping of years to event entries, one year per section
    input:
        dict - Mapping whose keys are convertible with int() and whose values are
               iterables of entries to print
    output:
        N/A - writes '<year>:' followed by one indented line per entry to stdout
    """
    for year_key in dict:
        heading = f'{int(year_key)}:'
        print(heading)
        entries = dict[year_key]
        for entry in entries:
            print('    ', entry)

In [3]:
# Create the scraper (this starts a Firefox WebDriver session) and run the full scrape.
# scrape_site() prints one "<year>: n of m events have results!" line per year and
# quits the browser at the end of the run.
scraper = IFSCScraper()
scraper.scrape_site()

2023: 0 of 13 events have results!
2022: 13 of 13 events have results!
2021: 9 of 10 events have results!
2020: 5 of 5 events have results!
2019: 14 of 14 events have results!
2018: 16 of 16 events have results!
2017: 15 of 15 events have results!
2016: 17 of 17 events have results!
2015: 13 of 13 events have results!
2014: 18 of 18 events have results!
2013: 19 of 19 events have results!
2012: 20 of 20 events have results!
2011: 22 of 23 events have results!
2010: 15 of 15 events have results!
2009: 15 of 16 events have results!
2008: 15 of 15 events have results!
2007: 19 of 19 events have results!
2006: 18 of 18 events have results!
2005: 15 of 16 events have results!
2004: 16 of 16 events have results!
2003: 19 of 19 events have results!
2002: 13 of 13 events have results!
2001: 10 of 10 events have results!
2000: 11 of 11 events have results!
1999: 12 of 12 events have results!
1998: 4 of 4 events have results!
1997: 6 of 6 events have results!
1996: 3 of 3 events have results!
19

In [5]:
# Print the (event name, event id) tuples stored per year in scraper.year_events
display_events(scraper.year_events)

2023:
     ('IFSC - Climbing World Cup (B) - Hachioji (JPN) 2023', '1291')
     ('IFSC - Climbing World Cup (B,S) - Seoul (KOR) 2023', '1292')
     ('IFSC - Climbing World Cup (S) - Jakarta (INA) 2023', '1293')
     ('IFSC - Climbing World Cup (B,S) - Salt Lake City (USA) 2023', '1294')
     ('IFSC - Climbing World Cup (B) - Prague (CZE) 2023', '1295')
     ('IFSC - Climbing World Cup (B) - Brixen (ITA) 2023', '1296')
     ('IFSC - Climbing World Cup (B,L) - Innsbruck (AUT) 2023', '1297')
     ('IFSC - Climbing World Cup (L,S) - Villars (SUI) 2023', '1298')
     ('IFSC - Climbing World Cup (L,S) - Chamonix (FRA) 2023', '1299')
     ('IFSC - Climbing World Cup (L) - Briançon (FRA) 2023', '1300')
     ('IFSC - Climbing World Championships (B,L,S,B&L) - Bern (SUI) 2023', '1301')
     ('IFSC - Climbing World Cup (L) - Koper (SLO) 2023', '1302')
     ('IFSC - Climbing World Cup (L,S) - Wujiang (CHN) 2023', '1303')
2022:
     ('IFSC - Climbing World Cup (B) - Meiringen (SUI) 2022', '1233')
 