In [79]:
from urllib.parse import urljoin, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [80]:
url = "https://results.checkpointspot.asia/results.aspx?CId=17036&RId=10510&EId=40" # Results page to scrape; make sure your network does not block this site.
# The parameters below are used to build the output file name:
# [eventname]_[startdate]_[category].csv
eventname = "Asia Triathlon Cup 2025"
# Event start date as a compact string (appears to be YYYYMMDD).
startdate = "20250222"
# Race category; also written into the "Category" column of the output data.
category = "SprintAquathlon5059F"


In [81]:

class WebScraper:
    """
    Scrape a results table and pagination links from a single web page.

    The page is downloaded and parsed as soon as the object is created.

    Attributes:
        start_url (str): The URL of the web page to scrape.
        timeout (float): Seconds to wait for the server before giving up.
        base_url (str): Scheme and host of ``start_url`` with a trailing slash,
            or ``start_url`` itself when it has no scheme/host.
        soup (BeautifulSoup): The parsed HTML content, or None if the download failed.
        tables (list): All ``<table>`` elements found in the page (empty if the
            download failed).
        data: Reserved for a DataFrame; this class never assigns it.
        other_pages_url (list): Absolute URLs collected by the last successful
            call to get_page_list().

    Methods:
        fetch_data(): Fetches the HTML content of the web page and parses it using BeautifulSoup.
        get_base_url(): Derives ``base_url`` from ``start_url``.
        get_data_table(): Retrieves the data table from the web page.
        get_page_list(): Retrieves the list of URLs from the web page.
    """

    def __init__(self, start_url, timeout=30):
        """
        Store the URL, derive the base URL and immediately fetch the page.

        Args:
            start_url (str): The URL of the web page to scrape.
            timeout (float): Seconds to wait for the server. Without a timeout
                a stalled connection would block forever.
        """
        self.start_url = start_url
        self.timeout = timeout
        self.base_url = ""
        # Initialize variables
        self.soup = None
        self.tables = []
        self.data = None  # This will store the dataframe
        self.other_pages_url = []
        self.get_base_url()
        self.fetch_data()

    def fetch_data(self):
        """
        Fetches the HTML content of the web page and parses it using BeautifulSoup.

        On a request error a message is printed and ``soup`` / ``tables`` keep
        their previous values (None / empty list right after construction).
        """
        try:
            response = requests.get(self.start_url, timeout=self.timeout)
            response.raise_for_status()  # Raise an exception if the request was unsuccessful
        except requests.RequestException as e:
            print(f"Error fetching URL: {e}")
            return
        self.soup = BeautifulSoup(response.content, 'html.parser')
        self.tables = self.soup.find_all("table")

    def get_base_url(self):
        """
        Derive the site root (``scheme://host/``) from ``start_url``.

        The host is taken from the parsed URL, so any top-level domain works
        (not only ".com"). If ``start_url`` has no scheme or host, it is used
        unchanged as the base URL.

        Returns:
            str: The value stored in ``self.base_url``.
        """
        parts = urlsplit(self.start_url)
        if parts.scheme and parts.netloc:
            self.base_url = f"{parts.scheme}://{parts.netloc}/"
        else:
            self.base_url = self.start_url
        return self.base_url

    def _absolute_url(self, link):
        """Resolve an href found on the page against ``start_url``."""
        # urljoin handles relative paths, leading slashes and hrefs that are
        # already absolute, which plain string concatenation does not.
        return urljoin(self.start_url, link)

    def get_data_table(self):
        """
        Retrieves the data table from the web page.

        The first non-empty row of ``<td>`` cells is used as the column header.

        Returns:
            pandas.DataFrame: The data table as a pandas DataFrame, or None if
            the page has fewer than two tables or the table has no cells.
        """
        # The second table on the page (index 1) is treated as the data table;
        # the first one is used by get_page_list() for links.
        if len(self.tables) < 2:
            print("Data table not found.")
            return None
        rows = self.tables[1].find_all("tr")
        rows_list = []
        for row in rows:
            cols = row.find_all("td")
            # Cells carrying the "d-sm-none" CSS class are skipped.
            cols_list = [col.text for col in cols if "d-sm-none" not in col.get("class", [])]
            if cols_list:  # Ensure the list is not empty
                rows_list.append(cols_list)
        if not rows_list:
            print("No data found in the table.")
            return None
        df = pd.DataFrame(rows_list)
        df.columns = df.iloc[0]
        df = df.iloc[1:, :]
        return df

    def get_page_list(self):
        """
        Retrieves the list of URLs from the web page.

        Links are read from the first table on the page and resolved to
        absolute URLs relative to ``start_url``.

        Returns:
            list: A list of absolute URLs, or None if the page has no tables
            or the first table contains no links.
        """
        # Assuming the first table contains links to other pages
        if not self.tables:
            print("No tables found.")
            return None
        links = [a['href'] for a in self.tables[0].find_all("a", href=True)]
        if not links:
            print("No URLs found.")
            return None
        links_updated = [self._absolute_url(link) for link in links]

        self.other_pages_url = links_updated
        return links_updated

class PageIterator(WebScraper):
    """
    Collect the data table of every page in a list of URLs.

    The constructor does not call WebScraper.__init__, so nothing is fetched
    for the iterator itself; each URL is scraped with its own WebScraper.

    Attributes:
    - page_list (list): The page URLs to visit, in order.
    - current_page (int): Set to 0 on construction; not updated by this class.
    - tables (list): The DataFrames gathered from all pages that had a data table.
    - web_scraper (WebScraper): The scraper created for the most recently visited URL.

    Methods:
    - __init__(self, urls): Stores the URLs and immediately scrapes all of them.
    - fetch_all_pages(self): Scrapes every URL in page_list and collects the tables.
    """

    def __init__(self, urls):
        self.page_list = urls
        self.current_page = 0
        self.tables = []  # Filled by fetch_all_pages()
        self.fetch_all_pages()

    def fetch_all_pages(self):
        """
        Scrape each URL in page_list and keep every data table that was found.

        Returns:
        - tables (list): The accumulated list of data tables (self.tables).
        """
        for address in self.page_list:
            scraper = WebScraper(address)
            self.web_scraper = scraper
            frame = scraper.get_data_table()
            if frame is None:
                # This page had no usable data table; move on to the next one.
                continue
            self.tables.append(frame)

        if not self.tables:
            print("No tables found.")

        return self.tables



In [82]:
# Create a WebScraper object with the start URL (this downloads and parses the page)
km1 = WebScraper(start_url=url)

# Get the data table from the start URL (None if it could not be scraped)
data = km1.get_data_table()

# Get the list of other pages to scrape (None if there are none)
page_list = km1.get_page_list()

if page_list is None:
    print("No other pages to scrape.")
    df_all = data

else:
    # Create a PageIterator object with the list of pages
    other_data = PageIterator(page_list)

    # Combine the tables from the other pages with the start page's table,
    # skipping any page that yielded no table. A new list is built so that
    # other_data.tables is not modified.
    all_data = [table for table in [*other_data.tables, data] if table is not None]

    # Concatenate all data tables into a single DataFrame
    df_all = pd.concat(all_data) if all_data else None

# Fail with a clear message instead of an opaque TypeError further down
if df_all is None:
    raise ValueError(f"No results table could be scraped from {url}")

# Work on a copy so the column assignments below never write into a slice
# of the scraped table
df_all = df_all.copy()

# Convert 'Pos' to numbers; empty or non-numeric placings become NaN
df_all['Pos'] = pd.to_numeric(df_all['Pos'], errors='coerce')

# Remove rows without a numeric position, then store positions as integers
df_all = df_all.dropna(subset=['Pos']).astype({'Pos': int})

# Sort the DataFrame by the 'Pos' column
df_all = df_all.sort_values(by=['Pos'])

# Remove rows where the 'Time' column does not contain a colon (i.e., is not in HH:MM:SS format);
# na=False treats a missing time as "no colon" instead of breaking the mask
df_all = df_all[df_all['Time'].str.contains(':', na=False)].copy()

df_all["Category"] = category



No URLs found.
No other pages to scrape.


In [83]:
# Write the cleaned results to CSV without the index column.
# The file name follows the pattern [eventname]_[startdate]_[category].csv
output_path = f"../Dataset/{eventname}_{startdate}_{category}.csv"
df_all.to_csv(output_path, index=False)