In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

class WebScraper:
    """
    Scrape a results page that holds a table of page links and a data table.

    Attributes:
        start_url (str): The URL of the web page to scrape.
        base_url (str): ``start_url`` cut off right after ".com/", or
            ``start_url`` itself when it contains no ".com/".
        timeout (float): Seconds to wait for the server before giving up.
        soup (BeautifulSoup): The parsed HTML content; None until a fetch succeeds.
        tables (list): The ``<table>`` Tag objects found in the page, in document order.
        data: Reserved for a DataFrame; no method of this class assigns it.

    Methods:
        fetch_data(): Fetches the HTML content of the web page and parses it using BeautifulSoup.
        get_base_url(): Derives ``base_url`` from ``start_url``.
        get_data_table(): Retrieves the data table from the web page.
        get_page_list(): Retrieves the list of URLs from the web page.
    """

    def __init__(self, start_url, timeout=30):
        """
        Store the URL, derive the base URL and fetch the page immediately.

        Args:
            start_url (str): The URL of the web page to scrape.
            timeout (float): Seconds to wait for the server; without a
                timeout an unresponsive server would block forever.
        """
        self.start_url = start_url
        self.timeout = timeout
        self.base_url = ""
        self.soup = None
        self.tables = []
        self.data = None  # this will store dataframe
        self.get_base_url()
        self.fetch_data()

    def fetch_data(self):
        """
        Fetches the HTML content of ``start_url`` and parses it using BeautifulSoup.

        On success ``soup`` and ``tables`` are replaced. On a request failure
        the error is printed and both are reset, so results from a previously
        fetched page are never mistaken for the current one.
        """
        try:
            response = requests.get(self.start_url, timeout=self.timeout)
            response.raise_for_status()  # Raise an exception if the request was unsuccessful
        except requests.RequestException as e:
            print(f"Error fetching URL: {e}")
            self.soup = None
            self.tables = []
            return
        self.soup = BeautifulSoup(response.content, 'html.parser')
        self.tables = self.soup.find_all("table")

    def get_base_url(self):
        """
        Derives ``base_url`` by dropping everything after ".com/" in ``start_url``.

        Only the literal ".com/" marker is recognised; when it is absent the
        whole ``start_url`` is used as the base URL.

        Returns:
            str: The value stored in ``base_url``.
        """
        marker = ".com/"
        index = self.start_url.find(marker)
        if index != -1:
            self.base_url = self.start_url[:index + len(marker)]
        else:
            self.base_url = self.start_url
        return self.base_url

    def get_data_table(self):
        """
        Retrieves the data table from the web page.

        Cells whose class list contains "d-sm-none" are skipped, rows without
        any remaining ``<td>`` cell are dropped, and the first remaining row
        supplies the column labels.

        Returns:
            pandas.DataFrame: The data table, or None when the page has fewer
            than two tables or the table has no usable rows.
        """
        # The second table is the data table; the first one holds the page
        # links read by get_page_list().
        if len(self.tables) < 2:
            print("Data table not found.")
            return None
        rows = self.tables[1].find_all("tr")
        rows_list = []
        for row in rows:
            cols = row.find_all("td")
            cols_list = [col.text for col in cols if "d-sm-none" not in col.get("class", [])]
            if cols_list:  # Ensure the list is not empty
                rows_list.append(cols_list)
        if not rows_list:
            print("No data found in the table.")
            return None
        df = pd.DataFrame(rows_list)
        # Promote the first row to the header, then drop it from the data.
        df.columns = df.iloc[0]
        df = df.iloc[1:, :]
        return df

    def get_page_list(self):
        """
        Retrieves the list of URLs from the web page.

        Returns:
            list: The ``href`` values of the links in the first table, exactly
            as written in the page, or None when the page has no tables.
        """
        # Assuming the first table contains links to other pages
        if not self.tables:
            print("No tables found.")
            return None
        links = [a['href'] for a in self.tables[0].find_all("a", href=True)]
        return links

    # def iterate_over_pages(self):
    #     """
    #     Iterates over each URL in the list and processes the data table.
    #     """
    #     # Get the list of URLs from the first page
    #     urls = self.get_page_list()
    #     if urls is None:
    #         print("No URLs found for iteration.")
    #         return
    #     # Iterate over each URL and process it
    #     for url in urls:
    #         # You might want to handle relative URLs here
    #         self.start_url = url
    #         self.fetch_data()
    #         data_table = self.get_data_table()
    #         if data_table is not None:
    #             # Process the data table as needed
    #             print(data_table)

In [2]:
# Results page to scrape; handed to WebScraper as its start_url.
url = "https://runnersunite.racetecresults.com/results.aspx?CId=16634&RId=1189&EId=2"
# WebScraper.__init__ calls get_base_url() and fetch_data(), so constructing
# the object already performs the HTTP request.
km1 = WebScraper(start_url=url)

In [3]:
# Second table of the fetched page as a DataFrame (None if it is missing or empty).
data   = km1.get_data_table()
# href values of the links in the first table (None if the page has no tables).
page_list = km1.get_page_list()

# Bare expression: presumably here so the notebook shows the list as the cell output.
page_list

['results.aspx?CId=16634&RId=1189&EId=2&dt=0&PageNo=2',
 'results.aspx?CId=16634&RId=1189&EId=2&dt=0&PageNo=3']

In [4]:

# NOTE(review): remove_strings_after_com is not defined in any cell shown here;
# presumably a standalone helper from another cell. Confirm it exists before running.
new_string = remove_strings_after_com(url)
# Bare expression: presumably here so the notebook shows the result as the cell output.
new_string



'https://runnersunite.racetecresults.com/'