In [None]:
import csv
import os
import re
import ssl
import time
import urllib.error
import urllib.request
from os import path, makedirs

import pandas as pd
from bs4 import BeautifulSoup
from google.colab import files

# Ignore SSL certificate errors.
# NOTE(review): turning off hostname checking and certificate verification means
# the downloaded pages are not authenticated; prefer fixing the trust store.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Single name for the catalog file so the write and the download cannot drift apart.
catalog_csv = '2015_present.csv'

# Finding the number of pages in the catalog
main_url = "https://www.endurance-data.com/en/competitions/1/"
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(main_url, context=ctx) as response:
    main_html = response.read()
main_soup = BeautifulSoup(main_html, 'html.parser')
pagination = main_soup(class_='page-item')
# The page count is read from the fourth 'page-item' element.
# NOTE(review): confirm this index against the site's current pager markup.
try:
    num_pages = int(pagination[3].text)
except (IndexError, ValueError):
    # Pager shorter than expected or not numeric: fall back to a single page
    # instead of aborting the whole scrape.
    print("Could not read the catalog page count; scraping the first page only.")
    num_pages = 1
dir_url = "https://www.endurance-data.com/en/competitions/{}/"
complete_race_info = []

# Extracting information from each race
for i in range(1, num_pages + 1):
    with urllib.request.urlopen(dir_url.format(i), context=ctx) as response:
        dir_html = response.read()
    dir_soup = BeautifulSoup(dir_html, 'html.parser')
    races = dir_soup.find_all(class_='cursor-pointer')

    for race in races:
        # Keep the text of every <td> except those whose text starts with a
        # newline (the same filter the regex '[\n]+' applied with re.match).
        race_stats = [
            info.text for info in race.find_all('td')
            if not info.text.startswith('\n')
        ]
        race_results = race.find_all('a')
        # The results URL is taken from the third link of the row; skip rows
        # that do not have one rather than crashing with IndexError/KeyError.
        results_href = race_results[2].get('href') if len(race_results) > 2 else None
        if not results_href:
            print(f"Skipping catalog row without a results link: {race_stats}")
            continue
        race_stats.append(results_href)
        complete_race_info.append(race_stats)

# Positional columns get readable names; any extra columns keep their integer label.
all_races_dataframe = pd.DataFrame.from_records(complete_race_info).rename(columns={
    0: 'Event',
    1: 'Date',
    2: 'Location',
    3: 'Athletes',
    4: 'Results'
})
all_races_dataframe.to_csv(catalog_csv, index=False)
# Colab-only helper: triggers a browser download of the file just written.
files.download(catalog_csv)


In [None]:
# Ignore SSL certificate errors.
# NOTE(review): turning off hostname checking and certificate verification means
# the downloaded pages are not authenticated; prefer fixing the trust store.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Getting time to find program execution time
start_time = time.time()

base_url = "https://www.endurance-data.com"
races_dir = 'races'
# Names for the positional columns that remain after the first two are dropped.
result_columns = {2: 'Place',
                  3: 'Name',
                  4: 'Bib',
                  5: 'Division',
                  6: 'Nation',
                  7: 'Swim',
                  8: 'Bike',
                  9: 'Run',
                  10: 'Time'}


def fetch_soup(url):
    """Download ``url`` and return it parsed with BeautifulSoup's 'html.parser'.

    The HTTP response is closed as soon as its body has been read.
    Raises whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``).
    """
    with urllib.request.urlopen(url, context=ctx) as response:
        return BeautifulSoup(response.read(), 'html.parser')


def parse_athletes(soup):
    """Return one list of child-node texts per element of class 'pointer' in ``soup``."""
    rows = []
    for athlete in soup.find_all(class_='pointer'):
        athlete_stats = []
        for stat in athlete:
            try:
                athlete_stats.append(stat.text)
            except AttributeError:
                # Child nodes that expose no .text are skipped.
                continue
        rows.append(athlete_stats)
    return rows


# Reading in directory csv as a list
with open('2015_present.csv', newline='') as f:
    all_races = list(csv.reader(f))

# Create the output directory once, before the loop.
makedirs(races_dir, exist_ok=True)

# Building csvs for each race.
# Row 0 is the header line written by to_csv; every later row is a race.
# (The previous slice [2:] also skipped the first race.)
for race in all_races[1:]:
    print(race)
    # Need event, date, location and a non-empty results link (last field).
    if len(race) < 4 or not race[-1]:
        print(f"Skipping malformed catalog row: {race}")
        continue

    # Replacing non file directory compatible items
    race_name = race[0].replace(" ", "_").replace(".", "_").replace("/", "_")
    race_date = race[1].replace("/", "_")
    race_csv = path.join(races_dir, race_name + race_date + ".csv")

    # Skipping race if already recorded
    if path.exists(race_csv):
        continue

    # Result pages are addressed by appending the page number to the race link.
    race_url = base_url + race[-1] + '{}'
    try:
        first_page = fetch_soup(race_url.format(1))
        pagination = first_page(class_='page-item')
        # The second-to-last 'page-item' is read as the page count; a pager with
        # fewer than two items is treated as a single page.
        num_pages = int(pagination[-2].text) if len(pagination) >= 2 else 1

        # Page 1 is already downloaded, so only pages 2..num_pages are fetched.
        ironman_results = parse_athletes(first_page)
        for i in range(2, num_pages + 1):
            ironman_results.extend(parse_athletes(fetch_soup(race_url.format(i))))
    except (urllib.error.URLError, ValueError) as err:
        # Nothing is written for a failed race, so the "already recorded" check
        # above lets a later run retry it instead of keeping a partial file.
        print(f"Skipping race {race[0]} on {race[1]}: {err}")
        continue

    # Build the frame once per race, after every page has been collected.
    ironman_dataFrame = pd.DataFrame.from_records(ironman_results)
    if ironman_dataFrame.empty:
        print(f"No athletes found for race: {race[0]} on {race[1]}")
        continue

    ironman_dataFrame = (
        ironman_dataFrame
        .drop(columns=ironman_dataFrame.columns[[0, 1]])
        .rename(columns=result_columns)
        .assign(Race=race[0], Date=race[1], Location=race[2])
    )

    # Saving race dataframe into csv's
    ironman_dataFrame.to_csv(race_csv, index=False)
    print(ironman_dataFrame)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:

def list_race_files(folder):
    """Return the sorted entry names in ``folder``, or ``None`` if it is not a directory.

    ``os.path.isdir`` is used rather than ``exists`` so that a regular file with
    the same name does not make ``os.listdir`` raise.
    """
    if not os.path.isdir(folder):
        return None
    return sorted(os.listdir(folder))


# Get the list of files in the 'races' directory
races_folder = 'races'
race_files = list_race_files(races_folder)
if race_files is None:
    print(f"The '{races_folder}' directory does not exist.")
else:
    print(f"Files in the '{races_folder}' directory:")
    for file in race_files:
        print(file)
