# Data Gathering
## Import Libraries & Tools

In [1]:
import pandas as pd
import numpy as np
import time
import glob
import re
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from datetime import date

## Create Directory Structure

In [2]:
# Root all scraped data in a "data" folder inside the current working
# directory (symlinks resolved).
DATA_DIR = os.path.join(os.path.realpath(os.getcwd()), "data")

# One directory per discipline and gender. Path components are passed
# separately so os.path.join picks the platform's separator.
BOULDER_MEN_DIR            = os.path.join(DATA_DIR, "Boulder", "Men")
BOULDER_WOMEN_DIR          = os.path.join(DATA_DIR, "Boulder", "Women")
LEAD_MEN_DIR               = os.path.join(DATA_DIR, "Lead", "Men")
LEAD_WOMEN_DIR             = os.path.join(DATA_DIR, "Lead", "Women")
SPEED_MEN_DIR              = os.path.join(DATA_DIR, "Speed", "Men")
SPEED_WOMEN_DIR            = os.path.join(DATA_DIR, "Speed", "Women")
COMBINED_MEN_DIR           = os.path.join(DATA_DIR, "Combined", "Men")
COMBINED_WOMEN_DIR         = os.path.join(DATA_DIR, "Combined", "Women")
BOULDER_AND_LEAD_MEN_DIR   = os.path.join(DATA_DIR, "Boulder & Lead", "Men")
BOULDER_AND_LEAD_WOMEN_DIR = os.path.join(DATA_DIR, "Boulder & Lead", "Women")

dirs = [
    BOULDER_MEN_DIR, BOULDER_WOMEN_DIR,
    LEAD_MEN_DIR, LEAD_WOMEN_DIR,
    SPEED_MEN_DIR, SPEED_WOMEN_DIR,
    COMBINED_MEN_DIR, COMBINED_WOMEN_DIR,
    BOULDER_AND_LEAD_MEN_DIR, BOULDER_AND_LEAD_WOMEN_DIR,
]

# Create every directory (and missing parents); existing ones are left alone.
# The loop variable is deliberately not named `dir`, which would shadow the
# builtin for everything that runs afterwards.
for directory in dirs:
    os.makedirs(directory, exist_ok=True)

# File to store names of events that have already been scraped
ALREADY_SCRAPED = os.path.join(DATA_DIR, "scraped_events.txt")
try:
    # Mode 'x' creates the file and fails if it is already there, so an
    # existing tracker is never truncated.
    with open(ALREADY_SCRAPED, 'x'):
        pass
except FileExistsError:
    # Only "already exists" is expected here; any other OSError (permissions,
    # missing parent, ...) is a real problem and is allowed to propagate.
    if os.stat(ALREADY_SCRAPED).st_size == 0:
        print("No data has been scraped yet!")

## IFSCScraper Class Definition

In [3]:
class IFSCScraper():
    """
    Scraper used to gather competition results from the IFSC website
    (ifsc-climbing.org). Each instance drives its own headless Firefox browser
    through Selenium and offers methods for scraping the results page (years,
    events, categories) and individual athlete pages.

    Results are written as .csv files into the module-level discipline
    directories, and progress is tracked in the ALREADY_SCRAPED text file so
    that repeated runs skip work that has already been done.
    """
    # Results page (dropdown driven, content lives inside an iframe)
    url  = 'https://www.ifsc-climbing.org/index.php/world-competition/last-result'
    # Base url of an athlete page; the athlete id is appended to it
    url2 = 'https://ifsc.results.info/#/athlete/'

    def __init__(self):
        """
        Initialize a scraper object with its own browser instance.

        Raises whatever Selenium raised if the browser could not be started.
        """
        self.generate_driver()
        time.sleep(1)

    def generate_driver(self):
        """
        Initialize the Selenium web browser (headless Firefox) and store it
        as self.driver.

        On failure the error is printed and then re-raised: without a driver
        every other method of this class would fail with an unrelated
        AttributeError.
        """
        try:
            # Sets headless option, so we don't see browser
            options = webdriver.FirefoxOptions()
            options.add_argument("--headless")
            self.driver = webdriver.Firefox(options=options)
        except Exception as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            print(template.format(type(ex).__name__, ex.args))
            raise

    def load_page(self, link, athlete_page=0, timeout=10, wait_after=1):
        """
        Helper function that loads a page and waits until its main container
        is visible.

        link: link to the page we wish to load
        athlete_page: flag selecting which container to wait for (truthy for
                      an athlete page, falsy for the results page)
        timeout: seconds to wait before timing out
        wait_after: seconds to wait after loading

        Raises TimeoutException if the container does not show up in time.
        The browser is quit before the exception is re-raised.
        """
        if athlete_page:
            container = "//div[@class='athlete-info left-side']"
        else:
            container = "//div[@class='uk-container']"

        # Visit link
        self.driver.get(link)

        try:
            WebDriverWait(self.driver, timeout).until(
                EC.visibility_of_element_located((By.XPATH, container))
            )
        except TimeoutException:
            print("Timed out waiting for page " + link + " to load")
            self.driver.quit()
            # The driver is gone; continuing would only fail later with a
            # less helpful error, so surface the timeout to the caller.
            raise

        # Give the page a moment to settle
        time.sleep(wait_after)

    def _open_results_frame(self, timeout):
        """
        Load the results page and switch into the iframe holding the dropdowns.

        timeout: seconds the returned wait object will wait
        Returns the WebDriverWait that was used, for further waits
        """
        self.load_page(IFSCScraper.url)
        wait = WebDriverWait(self.driver, timeout)

        # The data we are after resides within an iframe
        wait.until(EC.frame_to_be_available_and_switch_to_it(
            (By.CSS_SELECTOR, "iframe.jch-lazyloaded")
        ))
        return wait

    def get_year_list(self):
        """
        Opens the results page and gets the list of all years listed there.

        Returns list of years (as strings), in dropdown order
        Raises the underlying Selenium error if the page cannot be loaded
        """
        try:
            self._open_results_frame(timeout=5)
        except Exception:
            print('Error loading page!')
            raise

        year_dd, _, _, _ = self.get_dropdowns(self.driver)
        return [option.text for option in Select(year_dd).options]

    def get_single_year(self, year='2022'):
        """
        Fully scrape each event and category for a given year.

        For every event of the year, every category with a results table is
        converted into a .csv file. Both the file and the event are recorded
        in the tracker file so they are skipped on the next run.

        year: string of the year you want to scrape
        Raises the underlying Selenium error if the page cannot be loaded
        """
        try:
            wait = self._open_results_frame(timeout=8)
            print(f'Scraping {year}...')
        except Exception:
            print('Error loading page!')
            raise

        # Dropdown menus for each choice
        year_dd, league_dd, event_dd, cat_dd = self.get_dropdowns(self.driver)

        # Select given year and league
        Select(year_dd).select_by_visible_text(year)
        Select(league_dd).select_by_index(1)

        # Get list of all events for the year
        all_events = self.get_events(self.driver, event_dd)

        # Loop through each event, scrape results, and generate .csv files
        for i, event in enumerate(all_events):
            # Special case where two consecutive events share the same name
            # (rare). `i > 0` matters: index -1 would wrap around and compare
            # the first event with the last one.
            is_duplicate = i > 0 and event == all_events[i - 1]

            if self.check_if_scraped(event) and not is_duplicate:
                print(f'--Already scraped {event}!')
                continue

            # Some of the events aren't actually events, but more qualification
            # rounds, and they don't list the results correctly, which will
            # cause errors. The common thread is the naming of them.
            if event.count('(') < 1:
                print(f'--Skipping {event}...')
                continue

            # Select event. A duplicate name cannot be selected by text, so use
            # its position (+1 because get_events drops the first option).
            if is_duplicate:
                Select(event_dd).select_by_index(i + 1)
            else:
                Select(event_dd).select_by_visible_text(event)

            category_select = Select(cat_dd)

            # Some events were cancelled or don't have results listed.
            # `Exception` (not a bare except) keeps this best-effort while
            # still letting Ctrl-C interrupt a long scrape.
            try:
                wait.until(lambda d: len(category_select.options) > 1)
            except Exception:
                print(f'--No data for {event}!')
                continue

            if not self.check_if_scraped(event):
                print(f'--Scraping {event}...')

            # Read the category names up front so no option element has to be
            # touched again after the selection changes.
            categories = [opt.text for opt in category_select.options[1:]]

            # A duplicate event gets a distinct name once its first usable
            # category is reached.
            needs_rename = is_duplicate

            # Get results for each category
            for category in categories:
                Select(cat_dd).select_by_visible_text(category)

                # Finds table with desired data
                try:
                    wait.until(EC.visibility_of_element_located(
                        (By.XPATH, '//div[@id="table_id_wrapper"]')
                    ))
                except Exception:
                    print(f'----No data for {category}!')
                    continue

                table_wrapper = self.driver.find_element(By.XPATH, '//div[@id="table_id_wrapper"]')
                rows = table_wrapper.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

                # Event title & date, as shown on the page
                labels = self.driver.find_element(By.XPATH, '//div[@class="labels"]')
                title_and_date = labels.find_elements(By.TAG_NAME, 'p')

                # Sets event name in case of duplicate
                if needs_rename:
                    event = event[:-10] + '2 ' + event[-10:]
                    needs_rename = False

                # Generate correct filename
                file = self.generate_filename(
                    (category, title_and_date[0].text, title_and_date[1].text)
                )
                filepath = os.path.join(self.get_dir(category), file)

                # Skip only if the filename is in the .txt AND the file exists
                if self.check_if_scraped('--' + file, filepath):
                    continue

                print(f'----Scraping {category}...')

                # Data (list of dictionaries) contains each climber's results
                data = []
                for row in rows:
                    parsed = self.scrape_results(event, row, category)
                    if not parsed:
                        print(f'----Data format error for {category}! Cannot parse it further. Skipping...')
                        break
                    data.append(parsed)

                if data:
                    # Convert raw results into a .csv and mark file as scraped
                    df = pd.DataFrame.from_dict(data)
                    self.convert_to_csv(file, category, df)
                    self.add_to_scraped_file('--' + file)

            # All categories for the event have been scraped
            self.add_to_scraped_file(event)

    def get_athlete_height(self, athlete_id):
        """
        Scrape athlete page for climber's height.

        athlete_id: unique id assigned to each climber (string, appended to
                    the athlete url)
        Returns the text after ': ' in the second div of the athlete info box
        (presumably the height - verify against the live page)
        Raises the underlying error if the page cannot be loaded. The browser
        is quit first.
        """
        try:
            self.load_page(IFSCScraper.url2 + athlete_id, athlete_page=1)
        except Exception:
            print('Error loading page!')
            self.driver.quit()
            raise

        info_box = self.driver.find_element(By.XPATH, '//div[@class="athlete-info left-side"]')
        height_ele = info_box.find_elements(By.TAG_NAME, 'div')[1]
        return height_ele.text.split(': ')[1]

    def add_to_scraped_file(self, text):
        """
        Adds given text to our tracker file to keep track of what has been
        scraped. Text that is already recorded is not added again.

        text: string of text to add to file
        """
        if self.check_if_scraped(text):
            return
        with open(ALREADY_SCRAPED, 'a') as tracker:
            tracker.write(f'{text}\n')

    def check_if_scraped(self, text, file = ''):
        """
        Checks if the given text is recorded in the tracker file and,
        optionally, that a file exists on disk.

        text: string of text to look for in the tracker file
        file: path of a file that must also exist (ignored when empty)
        Returns true/false
        """
        with open(ALREADY_SCRAPED, 'r') as tracker:
            done = {line.strip() for line in tracker}

        return text in done and (not file or os.path.exists(file))

    def scrape_results(self, event, result, cat):
        """
        Function to scrape the desired results from one row of a results table.

        event: name of event the row belongs to
        result: table row element holding an individual climber's results
        cat: category text (discipline plus men's or women's)
        Returns dictionary with the result format matching the category:
        LEAD/BOULDER rows get Qualification, Semi-Final and Final; SPEED rows
        get Qualification and Final; anything else only Qualification.
        Returns False if the row does not have the expected cells.
        """
        details = result.find_elements(By.TAG_NAME, 'td')

        try:
            # The athlete id lookup is inside the try as well, so a row without
            # a link (or with too few cells) reports False instead of crashing.
            athlete_id = (details[1].find_element(By.TAG_NAME, 'a')
                          .get_attribute('href').split('id=')[1])

            record = {
                "Event": event,
                "ID": athlete_id,
                "Rank": details[0].text,
                "Name": f"{details[1].text} {details[2].text}",
                "Gender": 'F' if 'WOMEN' in cat.upper() else 'M',
                "Country": details[3].text,
                "Qualification": details[4].text,
            }

            if "LEAD" in cat or "BOULDER" in cat:
                record["Semi-Final"] = details[5].text
                if len(details) == 7: # each round has data
                    record["Final"] = details[6].text
                else: # semi-final round acts as final
                    record["Final"] = details[5].text
            elif "SPEED" in cat:
                record["Final"] = details[5].text
        except Exception:
            return False
        return record

    def get_dropdowns(self, driver):
        """
        Helper function to quickly find the four dropdown menus on the page.

        driver: selenium web driver
        Returns one web element for each dropdown menu, in the order
        year, league, event, category
        """
        select_ids = ("years", "indexes", "events", "categories")
        year_dd, league_dd, event_dd, cat_dd = (
            driver.find_element(By.XPATH, f'//select[@id="{select_id}"]')
            for select_id in select_ids
        )
        return year_dd, league_dd, event_dd, cat_dd

    def get_events(self, driver, events_dd):
        """
        Function to get a list of all events.

        driver: selenium web driver
        events_dd: dropdown element for the event menu
        Returns list of event names in the events_dd menu, without the first
        option
        """
        event_opts = Select(events_dd)
        # The menu is filled in after the year/league selection
        WebDriverWait(driver, 10).until(lambda d: len(event_opts.options) > 1)
        return [option.text for option in event_opts.options[1:]]

    def generate_filename(self, packed_data):
        """
        Function to take the event name and convert it to a consistent,
        formatted filename of the form {date}_{event}_{category}.csv.

        packed_data: tuple (category, event title, event date) holding all
                     necessary info to generate the filename
        Returns the lower-cased filename with spaces replaced by underscores,
        parentheses by square brackets and commas removed
        """
        category, event, event_date = packed_data

        # Last three words of the date, reversed (e.g. "21 May 2023" ->
        # "2023 May 21")
        date_part = ' '.join(event_date.split()[::-1][:3])

        # Drop the trailing word of the title, or the trailing two words when
        # the title ends in CANCELLED
        words = event.replace('- ', '').split()
        dropped = 2 if words[-1] == 'CANCELLED' else 1
        event_part = ' '.join(words[:-dropped])

        # Uses Regex to clean because not every name has the same format:
        # split into "everything before the first '('" and "the rest", then
        # remove the first parenthesised group (the discipline codes).
        full_name = ' '.join([date_part, event_part, category])
        pieces = re.findall(r"^[^\(]+|[\(].*", full_name)
        if len(pieces) > 1 and ') ' in pieces[1]:
            pieces[1] = pieces[1].split(') ', 1)[1]

        cleaned = (''.join(pieces)
                   .replace('(', '[')
                   .replace(')', ']')
                   .replace(' ', '_')
                   .replace(',', '')
                   .lower())
        return cleaned + '.csv'

    def convert_to_csv(self, filename, category, data):
        """
        Function to take the given data and create a .csv with the given
        filename in the given category directory.

        filename: filename generated by generate_filename function
        category: category text (used to pick the directory)
        data: dataframe we want to save
        """
        # os.path.join keeps this path identical to the one get_single_year
        # checks for existence, on every platform.
        target = os.path.join(self.get_dir(category), filename)

        # Generates .csv with filename
        data.to_csv(target, index=False)

    def get_dir(self, category):
        """
        Function to return the proper directory to use, based on category.

        category: category text; matched word by word, case-insensitively,
                  against a gender (MEN / WOMEN) and a discipline
        Returns the matching directory, or None if the category is not
        recognised
        """
        tokens = category.upper().split()

        # (discipline word, men's directory, women's directory), checked in
        # this order. NOTE(review): "BOULDER&LEAD" only matches when written
        # without spaces - confirm against the site's category names.
        directories = (
            ("BOULDER", BOULDER_MEN_DIR, BOULDER_WOMEN_DIR),
            ("LEAD", LEAD_MEN_DIR, LEAD_WOMEN_DIR),
            ("SPEED", SPEED_MEN_DIR, SPEED_WOMEN_DIR),
            ("COMBINED", COMBINED_MEN_DIR, COMBINED_WOMEN_DIR),
            ("BOULDER&LEAD", BOULDER_AND_LEAD_MEN_DIR, BOULDER_AND_LEAD_WOMEN_DIR),
        )

        if "MEN" in tokens:
            column = 1
        elif "WOMEN" in tokens:
            column = 2
        else:
            return None

        for entry in directories:
            if entry[0] in tokens:
                return entry[column]
        return None

    def end_session(self):
        """Basic function to end the selenium web driver and close the browser."""
        print('SESSION DONE! Quitting webdriver and closing browser...')
        self.driver.quit()

    def scrape_all_ifsc_world_cups(self):
        """
        Complete function to go through the listed years and scrape all IFSC
        World Cup competitions, then end the session.

        The 17 last entries of the year list are left out. The session is
        ended even if scraping fails part-way.
        """
        try:
            years = self.get_year_list()
            for year in years[:-17]: # 2023-2007, 2023 events ongoing
                # Use this instance, not a global that happens to be named
                # `scraper`.
                self.get_single_year(year)
        finally:
            self.end_session()

In [4]:
# Takes ~37 minutes to scrape all years and events
scraper = IFSCScraper()
try:
    # To gather everything once, run scraper.scrape_all_ifsc_world_cups()
    # instead of the line below (it ends the session itself).
    scraper.get_single_year(scraper.get_year_list()[0]) # Gets results for most recent year
finally:
    # Always close the headless browser, even if scraping raised, so no
    # Firefox process is left running in the background.
    scraper.end_session()

Scraping 2023...
--Already scraped IFSC - Climbing World Cup (B) - Hachioji (JPN) 2023!
--Already scraped IFSC - Climbing World Cup (B,S) - Seoul (KOR) 2023!
--Already scraped IFSC - Climbing World Cup (S) - Jakarta (INA) 2023!
--Scraping IFSC - Climbing World Cup (B,S) - Salt Lake City (USA) 2023...
----Scraping BOULDER Men...
----Scraping BOULDER Women...
----Scraping SPEED Men...
----Scraping SPEED Women...
--No data for IFSC - Climbing World Cup (B) - Prague (CZE) 2023!
--No data for IFSC - Climbing World Cup (B) - Brixen (ITA) 2023!
--No data for IFSC - Climbing World Cup (B,L) - Innsbruck (AUT) 2023!
--No data for IFSC - Climbing World Cup (L,S) - Villars (SUI) 2023!
--No data for IFSC - Climbing World Cup (L,S) - Chamonix (FRA) 2023!
--No data for IFSC - Climbing World Cup (L) - Briançon (FRA) 2023!
--No data for IFSC - Climbing World Championships (B,L,S,B&L) - Bern (SUI) 2023!
--No data for IFSC - Climbing World Cup (L) - Koper (SLO) 2023!
--No data for IFSC - Climbing World C