## Import Libraries & Tools

In [1]:
import pandas as pd
import numpy as np
import pprint
import time
import csv
import re
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from datetime import date

## Create Directory Structure

In [2]:
# Root folder for all scraped output.
# NOTE: "__file__" is a string literal here, so realpath() resolves it against the
# current working directory and DATA_DIR ends up as "<cwd>/data".
DATA_DIR = os.path.join(
    os.path.dirname(os.path.realpath("__file__")), "data"
)

# One output folder per discipline and gender. Path components are passed separately
# so os.path.join picks the right separator on every platform.
BOULDER_MEN_DIR            = os.path.join(DATA_DIR, "Boulder", "Men")
BOULDER_WOMEN_DIR          = os.path.join(DATA_DIR, "Boulder", "Women")
LEAD_MEN_DIR               = os.path.join(DATA_DIR, "Lead", "Men")
LEAD_WOMEN_DIR             = os.path.join(DATA_DIR, "Lead", "Women")
SPEED_MEN_DIR              = os.path.join(DATA_DIR, "Speed", "Men")
SPEED_WOMEN_DIR            = os.path.join(DATA_DIR, "Speed", "Women")
COMBINED_MEN_DIR           = os.path.join(DATA_DIR, "Combined", "Men")
COMBINED_WOMEN_DIR         = os.path.join(DATA_DIR, "Combined", "Women")
BOULDER_AND_LEAD_MEN_DIR   = os.path.join(DATA_DIR, "Boulder & Lead", "Men")
BOULDER_AND_LEAD_WOMEN_DIR = os.path.join(DATA_DIR, "Boulder & Lead", "Women")

dirs = [BOULDER_MEN_DIR, BOULDER_WOMEN_DIR, LEAD_MEN_DIR, LEAD_WOMEN_DIR,
        SPEED_MEN_DIR, SPEED_WOMEN_DIR, COMBINED_MEN_DIR, COMBINED_WOMEN_DIR,
        BOULDER_AND_LEAD_MEN_DIR, BOULDER_AND_LEAD_WOMEN_DIR]

# Create each directory if it doesn't exist. exist_ok=True replaces the separate
# exists() check, and the loop variable no longer shadows the builtin `dir`.
for output_dir in dirs:
    os.makedirs(output_dir, exist_ok=True)

# File to store names of events that have already been scraped
ALREADY_SCRAPED = os.path.join(DATA_DIR, "scraped_events.txt")
try:
    # Mode 'x' creates the file and raises FileExistsError when it is already there
    with open(ALREADY_SCRAPED, 'x'):
        pass
except FileExistsError:
    # Only "file already exists" is expected here; any other OSError (permissions,
    # missing parent, ...) is a real problem and is allowed to surface.
    if os.stat(ALREADY_SCRAPED).st_size == 0:
        print("No data has been scraped yet!")

In [3]:
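# NOTE: superseded version of the directory setup, kept for reference only. The active
# setup is the "Create Directory Structure" cell above, which creates per-gender folders.
# # Create paths for .csv files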
# DATA_DIR = os.path.join(
#     os.path.dirname(os.path.realpath("__file__")), "data"
# )

# # Individual directories for each competition category
# BOULDER_DIR      = os.path.join(DATA_DIR, "Boulder")
# LEAD_DIR         = os.path.join(DATA_DIR, "Lead")
# SPEED_DIR        = os.path.join(DATA_DIR, "Speed")
# COMBINED_DIR     = os.path.join(DATA_DIR, "Combined")
# BOULDER_AND_LEAD = os.path.join(DATA_DIR, "Boulder & Lead")
# dirs = [DATA_DIR, BOULDER_DIR, LEAD_DIR, SPEED_DIR, COMBINED_DIR, BOULDER_AND_LEAD]

# # Create directory if it doesn't exist
# for dir in dirs:
#     if not os.path.exists(dir):
#         os.makedirs(dir)
        
# # File to store names of events that have already been scraped
# try:
#     ALREADY_SCRAPED = os.path.join(DATA_DIR, "scraped_events.txt")
#     # create file
#     with open(ALREADY_SCRAPED, 'x') as fp:
#         pass
# except:
#     if os.stat(ALREADY_SCRAPED).st_size == 0:
#         print("No data has been scraped yet!")

## IFSCScraper Class Definition

In [4]:
class IFSCScraper():
    """
    Define a class for the scraper that will be used to gather data from the IFSC website
    (ifsc-climbing.org)
    Includes methods that allow for scraping different pages and different information

    The results page exposes four chained dropdowns (years, indexes/leagues, events,
    categories) inside an iframe; the methods below drive those dropdowns with Selenium
    and write one .csv per event/category. Progress is tracked in the text file named by
    the module-level ALREADY_SCRAPED constant, and output folders come from the
    module-level *_DIR constants.
    """
    # Page url
    url = 'https://www.ifsc-climbing.org/index.php/world-competition/last-result'

    def __init__(self, debug=False):
        """
        Initialize a scraper object with its own browser instance
        Input:
            debug - Indicates whether this is a debug instance for quicker development
                    (stored on the instance; not read by any method of this class)
        """
        self.debug = debug
        # Filled by get_years_events: {year: [(event name, event id), ...]}
        self.year_events = {}

        self.generate_driver()
        time.sleep(1)

    def generate_driver(self):
        """
        Initialize Selenium web browser
        Input:
            N/A
        Output:
            N/A - sets self.driver (left as None when the browser cannot be started)
        """
        # Defined up front so the attribute always exists, even on failure
        self.driver = None
        try:
            self.driver = webdriver.Firefox()
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts
            print('Error: Could not create WebDriver object...')

    def load_page(self, link, timeout=10, wait_after=5):
        """
        Helper function that loads a page and waits for timeout
        input:
            link - Link to the page we wish to load
            timeout - Seconds to wait before timing out
            wait_after - Seconds to wait after loading
        output:
            N/A
        """

        # Visit link
        self.driver.get(link)

        # Wait until the main container is visible
        try:
            WebDriverWait(self.driver, timeout).until(EC.visibility_of_element_located((By.XPATH,
            "//div[@class='uk-container']")))
        except TimeoutException:
            print("Timed out waiting for page " + link + " to load")
            self.driver.quit()
            # The browser is closed at this point, so there is nothing left to wait for
            return

        # Wait for page to load
        time.sleep(wait_after)

    def get_comp_years_and_league(self):
        """
        Parse the world-competition/last-result page to find and return years and leagues
        input:
            N/A
        output:
            List of (year, league) tuples, one per distinct year in the dropdown.
            The league is always the option at index 1 of the league dropdown.
        """
        try:
            self.load_page(IFSCScraper.url)
        except Exception:
            print('Error loading page!')
            self.driver.quit()

        # The dropdown menus to pick years/leagues/events is in an iframe - we need to switch to it
        frame = self.driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/iframe')
        self.driver.switch_to.frame(frame)

        # Dropdown menus for each choice
        year_dd   = self.driver.find_element(By.XPATH, '//select[@id="years"]')
        league_dd = self.driver.find_element(By.XPATH, '//select[@id="indexes"]')

        # Extract text of the options
        years  = [opt.text for opt in Select(year_dd).options]
        league = Select(league_dd).options[1].text  # world cup only for now

        # dict.fromkeys drops duplicate years while keeping dropdown order
        return [(year, league) for year in dict.fromkeys(years)]

    def get_years_events(self, years):
        """
        Iterate through each year and get that years events
        input:
            years - Iterable of entries whose first element is the year value, e.g. the
                    (year, league) tuples returned by get_comp_years_and_league
        output:
            N/A - fills self.year_events[year] with a list of (event name, event id)
            tuples and prints, per year, how many events have results
        """
        try:
            self.load_page(IFSCScraper.url)
        except Exception:
            print('Error loading page!')

        # The dropdown menus to pick years/leagues/events is in an iframe - we need to switch to it
        frame = self.driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div/iframe')
        self.driver.switch_to.frame(frame)

        # Dropdown menus for each choice
        year_dd, league_dd, event_dd, _ = self.get_dropdowns(self.driver)

        # Creating wait
        wait = WebDriverWait(self.driver, 10)

        # Iterate through years provided by get_comp_years_and_league
        for entry in years:
            year_value = entry[0]  # first element of the (year, league) tuple
            Select(year_dd).select_by_value(year_value)
            Select(league_dd).select_by_index(1)  # index 0 is skipped; 1 is the first league

            # IF THINGS BREAK, UNCOMMENT THIS SECTION
            # Selects third dropdown menu, events
            # event_dd = self.driver.find_element(By.XPATH, '//select[@id="events"]')

            # Waits for third dropdown to populate with options based on the league
            events_select = Select(event_dd)
            wait.until(lambda d: len(events_select.options) > 1)

            # Gets event options and ids for year and add to dictionary
            event_opts = events_select.options[1:]
            events = [opt.text for opt in event_opts]
            event_ids = [opt.get_attribute("value").split('/')[-1] for opt in event_opts]
            self.year_events[year_value] = list(zip(events, event_ids))

            print(f'{year_value}: {self.check_for_event_results(self.driver, event_dd, events)}')

    def check_for_event_results(self, driver, dd, events):
        """
        Iterate through each event and check if results exist
        input:
            driver - WebDriver already switched into the results iframe
            dd - The events dropdown element
            events - List of event names (visible text of the dropdown options)
        output:
            Summary string of the form '<n> of <total> events have results!'
        """
        events_with_results = 0
        for event in events:
            # Select event in events dropdown
            Select(dd).select_by_visible_text(event)

            # Select fourth dropdown menu, categories
            cat_dd = driver.find_element(By.XPATH, '//select[@id="categories"]')
            cat_select = Select(cat_dd)
            wait = WebDriverWait(driver, 2)
            try:
                wait.until(lambda d: len(cat_select.options) > 1)
            except Exception:
                # Category dropdown never filled in: counted as "no results"
                continue
            events_with_results += 1
        return f'{events_with_results} of {len(events)} events have results!'

    def get_year_list(self):
        """
        Load the results page and read the available years
        input:
            N/A
        output:
            List with the visible text of every option in the years dropdown
        """
        try:
            self.load_page(IFSCScraper.url)
            wait = WebDriverWait(self.driver, 5)

            # The data we are after resides within an iframe
            wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe.jch-lazyloaded")))
        except Exception:
            print('Error loading page!')

        # Dropdown menus for each choice
        year_dd, _, _, _ = self.get_dropdowns(self.driver)
        return [opt.text for opt in Select(year_dd).options]

    def get_single_year(self, year = '2022'):
        """
        Fully scrape each event for a given year
        input:
            year (string) - year to be scraped
        output:
            N/A - writes one .csv per event/category via convert_to_csv and records
            finished events/files in the ALREADY_SCRAPED file
        """
        # Created before the try so the waits further down always have a wait object,
        # even when loading the page raised.
        wait = WebDriverWait(self.driver, 5)
        try:
            self.load_page(IFSCScraper.url)

            # The data we are after resides within an iframe
            wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe.jch-lazyloaded")))
            print(f'Scraping {year}...')
        except Exception:
            print('Error loading page!')

        # Dropdown menus for each choice
        year_dd, league_dd, event_dd, cat_dd = self.get_dropdowns(self.driver)

        # Select given year and league
        Select(year_dd).select_by_visible_text(year)
        Select(league_dd).select_by_index(1)

        # Get list of all events for the year
        all_events = self.get_events(self.driver, event_dd)

        # Loop through each event, scrape results, and generate .csv file
        for i, event in enumerate(all_events):
            # Special case where two consecutive events share the same name.
            # `i > 0` matters: without it index -1 would compare the first event
            # against the LAST one and wrongly flag it as a duplicate.
            same_event_name = i > 0 and event == all_events[i - 1]

            # Skip events that were already scraped (a same-named follow-up event is
            # never skipped here; its files are checked individually below)
            if self.check_if_scraped(event) and not same_event_name:
                print(f'Already scraped {event}!')
                continue

            # Select event; duplicates cannot be told apart by text, so use the index
            # (+1 because option 0 of the dropdown is not an event)
            if same_event_name:
                Select(event_dd).select_by_index(i + 1)
            else:
                Select(event_dd).select_by_visible_text(event)

            category_select = Select(cat_dd)

            # Some events were cancelled or don't have results listed, check for it here
            try:
                wait.until(lambda d: len(category_select.options) > 1)
            except Exception:
                print(f'No data for {event}!')
                continue

            # Bookkeeping lives outside the try so file errors are not reported as "No data"
            if not self.check_if_scraped(event):
                self.add_to_scraped_file(event)
                print(f'--Scraping {event}...')

            # Read the category names once, before any selection changes the page
            category_names = [opt.text for opt in category_select.options[1:]]

            # Get results for each category
            for cat_name in category_names:
                Select(cat_dd).select_by_visible_text(cat_name)  # selects category

                # Finds table with desired data
                try:
                    wait.until(EC.visibility_of_element_located((By.XPATH, '//div[@id="table_id_wrapper"]')))
                except Exception:
                    print(f'No data for {cat_name}!')
                    continue

                table_wrapper = self.driver.find_element(By.XPATH, '//div[@id="table_id_wrapper"]')
                results = table_wrapper.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

                # Get event name and date
                event_details = self.driver.find_element(By.XPATH, '//div[@class="labels"]')
                event_results = event_details.find_elements(By.TAG_NAME, 'p')  # Event title & date

                # Get filename to check if it exists already
                file = self.generate_filename((cat_name, event_results[0].text, event_results[1].text))
                if same_event_name:
                    # Second event with the same name: disambiguate with a '2' suffix
                    file = file[:-4] + '2' + '.csv'

                path = self.get_dir(cat_name)
                if path is None:
                    # Unknown category: skip it instead of failing on os.path.join(None, ...)
                    print(f'No directory for {cat_name}!')
                    continue

                filepath = os.path.join(path, file)
                marker = '--' + file  # how finished files are recorded in ALREADY_SCRAPED

                # Checks if the filename has been added to the .txt, AND if the file exists
                if self.check_if_scraped(marker, filepath):
                    continue

                print(f'----Scraping {cat_name}...')

                # Data (list of dictionaries) contains each climber's results
                data = []
                for result in results:
                    # Each climber's result stored in dict
                    row = self.scrape_results(result, cat_name)

                    if row:
                        data.append(row)
                    else:
                        print(f'No data for {cat_name}!')

                # Create dataframe after collecting all the data
                df = pd.DataFrame(data)

                # Convert results into a .csv and marks file as scraped
                self.convert_to_csv(file, cat_name, df)
                self.add_to_scraped_file(marker)

    def add_to_scraped_file(self, text):
        """
        Append a line to the ALREADY_SCRAPED file unless it is already recorded
        input:
            text - Event name or '--<filename>' marker to record
        output:
            N/A
        """
        if self.check_if_scraped(text):
            return
        with open(ALREADY_SCRAPED, 'a') as file:
            file.write(f'{text}\n')

    def check_if_scraped(self, text, file = ''):
        """
        Check whether an entry is recorded in the ALREADY_SCRAPED file
        input:
            text - Line to look for (compared against whitespace-stripped lines)
            file - Optional path; when given, the file must also exist on disk
        output:
            True if text is recorded (and, when file is given, file exists), else False
        """
        with open(ALREADY_SCRAPED, 'r') as f:
            done = {line.strip() for line in f}

        if text not in done:
            return False
        return os.path.exists(file) if file else True

    def scrape_results(self, result, cat):
        """
        Turn one result table row into a dictionary
        input:
            result - Table row element; its 'td' cells are read by position
            cat - Category name, decides which round columns are expected
        output:
            Dict with Rank, Name, Country and the round columns, or False when the
            row does not have the expected cells
        """
        details = result.find_elements(By.TAG_NAME, 'td')

        # Round columns follow the four leading cells (rank, two name cells, country)
        if "LEAD" in cat or "BOULDER" in cat:
            round_columns = ("Qualification", "Semi-Final", "Final")
        elif "SPEED" in cat:
            round_columns = ("Qualification", "Final")
        else:
            round_columns = ("Qualification",)

        try:
            row = {
                "Rank": details[0].text,
                "Name": f"{details[1].text} {details[2].text}",
                "Country": details[3].text,
            }
            for position, column in enumerate(round_columns, start=4):
                row[column] = details[position].text
        except Exception:
            # e.g. a row with fewer cells than expected
            return False
        return row

    def get_dropdowns(self, driver):
        """
        Locate the four dropdown menus of the results page
        input:
            driver - WebDriver already switched into the results iframe
        output:
            Tuple (year_dd, league_dd, event_dd, cat_dd) of <select> elements
        """
        year_dd   = driver.find_element(By.XPATH, '//select[@id="years"]')
        league_dd = driver.find_element(By.XPATH, '//select[@id="indexes"]')
        event_dd  = driver.find_element(By.XPATH, '//select[@id="events"]')
        cat_dd    = driver.find_element(By.XPATH, '//select[@id="categories"]')
        return year_dd, league_dd, event_dd, cat_dd

    def get_events(self, driver, events_dd):
        """
        Wait for the events dropdown to fill and return the event names
        input:
            driver - WebDriver used for the wait
            events_dd - The events dropdown element
        output:
            List with the visible text of every option except the first one
        """
        event_opts = Select(events_dd)
        wait = WebDriverWait(driver, 10)
        wait.until(lambda d: len(event_opts.options) > 1)
        return [opt.text for opt in event_opts.options[1:]]

    def generate_filename(self, packed_data):
        """
        Build the .csv filename for one event/category
        input:
            packed_data - Tuple (category, event title, date text)
        output:
            Lower-case filename '{year}_{month}_{event}_{category}.csv' with spaces
            turned into underscores, commas removed and () replaced by []
        """
        # Unpacks data
        (category, event, date) = packed_data

        # Keep the last two words of the date, reversed (e.g. '... April 2022' -> '2022 April')
        date = ' '.join(date.split()[::-1][:2])

        # Cleans up event name: drop the '- ' separators and the trailing year
        # (plus the trailing 'CANCELLED' marker when present)
        words = event.replace('- ', '').split()
        if words and words[-1] == 'CANCELLED':
            event = ' '.join(words[:-2])
        else:
            event = ' '.join(words[:-1])

        # Uses Regex to clean because not every name has the same format:
        # split into the text before the first '(' and the rest
        filename = ' '.join([date, event, category])
        parts = re.findall(r"^[^\(]+|[\(].*", filename)
        if len(parts) > 1:
            # Drop the leading parenthesised marker, e.g. '(B) ' - only when it is there,
            # so titles without one no longer raise IndexError
            _, closing, remainder = parts[1].partition(') ')
            if closing:
                parts[1] = remainder
        filename = (''.join(parts)
                    .replace('(','[')
                    .replace(')',']')
                    .replace(' ', '_')
                    .replace(',', '')
                    .lower()) + '.csv'
        return filename

    def convert_to_csv(self, filename, category, data):
        """
        Write a results table to the directory that belongs to its category
        input:
            filename - Name of the .csv file
            category - Category name, resolved to a directory with get_dir
            data - Object with a pandas-style to_csv(path, index=...) method
        output:
            N/A (raises TypeError from os.path.join when get_dir returns None)
        """
        # Figure out correct directory
        path = self.get_dir(category)

        # os.path.join keeps this consistent with the path that get_single_year checks
        # and works on every platform (a literal '\\' only works on Windows)
        file = os.path.join(path, filename)

        # Generates .csv with filename
        data.to_csv(file, index=False)

    def get_dir(self, category):
        """
        Map a category name to its output directory
        input:
            category - Category name such as 'BOULDER Men' (case-insensitive)
        output:
            One of the module-level *_DIR paths, or None when nothing matches
        """
        # Collapse spaces around '&' so 'BOULDER & LEAD' and 'BOULDER&LEAD' are the same
        # token and are not mistaken for plain BOULDER
        tokens = re.sub(r'\s*&\s*', '&', category.upper()).split()

        men_dirs = (
            ("BOULDER", BOULDER_MEN_DIR),
            ("LEAD", LEAD_MEN_DIR),
            ("SPEED", SPEED_MEN_DIR),
            ("COMBINED", COMBINED_MEN_DIR),
            ("BOULDER&LEAD", BOULDER_AND_LEAD_MEN_DIR),
        )
        women_dirs = (
            ("BOULDER", BOULDER_WOMEN_DIR),
            ("LEAD", LEAD_WOMEN_DIR),
            ("SPEED", SPEED_WOMEN_DIR),
            ("COMBINED", COMBINED_WOMEN_DIR),
            ("BOULDER&LEAD", BOULDER_AND_LEAD_WOMEN_DIR),
        )

        for gender, table in (("MEN", men_dirs), ("WOMEN", women_dirs)):
            if gender in tokens:
                for discipline, directory in table:
                    if discipline in tokens:
                        return directory
        return None

    def end_session(self):
        """
        Close the browser, if one was started
        """
        if getattr(self, 'driver', None) is not None:
            self.driver.quit()

    def scrape_site(self):
        """
        Collect the events of every year and close the browser afterwards
        """
        try:
            self.get_years_events(self.get_comp_years_and_league())
        finally:
            # Close the browser even when scraping fails part-way
            self.end_session()
        
def display_events(dict):
    """
    Print a year -> entries mapping in a readable, indented layout
    input:
        dict - Mapping whose keys can be converted with int() and whose values are
               iterables of entries to list under each key
    output:
        N/A - prints '<year>:' followed by one indented line per entry
    """
    for year_key in dict:
        header = f'{int(year_key)}:'
        print(header)
        entries = dict[year_key]
        for entry in entries:
            print('    ', entry)

In [5]:
scraper = IFSCScraper()
try:
    # years = scraper.get_year_list()
    # for year in years[12:]:
    #     scraper.get_single_year(year)
    scraper.get_single_year('2022')
finally:
    # Always close the browser, even when scraping raises part-way through;
    # otherwise every failed run leaves a Firefox window/process behind
    scraper.end_session()
# scraper.scrape_site()

Scraping 2022...
Already scraped IFSC - Climbing World Cup (B) - Meiringen (SUI) 2022!
Already scraped IFSC - Climbing World Cup (B,S) - Seoul (KOR) 2022!
Already scraped IFSC - Climbing World Cup (B,S) - Salt Lake City (USA) 2022!
Already scraped IFSC - Climbing World Cup (B) - Brixen (ITA) 2022!
Already scraped IFSC - Climbing World Cup (B,L) - Innsbruck (AUT) 2022!
Already scraped IFSC - Climbing World Cup (L,S) - Villars (SUI) 2022!
Already scraped IFSC - Climbing World Cup (L,S) - Chamonix (FRA) 2022!
Already scraped IFSC - Climbing World Cup (L) - Briançon (FRA) 2022!
Already scraped IFSC - Climbing World Cup (L) - Koper (SLO) 2022!
Already scraped IFSC - Climbing World Cup (L,S) - Edinburgh (GBR) 2022!
Already scraped IFSC - Climbing World Cup (L,S) - Jakarta (INA) 2022!
Already scraped IFSC - Climbing World Cup (B&L) - Morioka, Iwate (JPN) 2022!
