In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import logging
import configparser
import requests_cache

# Root logger setup: emit INFO and above, formatted as "LEVEL: message"
logging.basicConfig(
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
)

# Load configuration from a file
class ConfigLoader:
    """
    Groups helpers for reading INI-style configuration files.
    """

    @staticmethod
    def load_config(filename):
        """
        Reads an INI configuration file into a ConfigParser.

        Args:
            filename (str): Path of the configuration file to read.

        Returns:
            configparser.ConfigParser: The parser populated from the file.

        Raises:
            OSError: If the file cannot be opened (e.g. it does not exist).
            configparser.Error: If the content is not valid INI syntax.
        """
        config = configparser.ConfigParser()
        # The file is opened explicitly (instead of ConfigParser.read) so a
        # missing file raises rather than being silently skipped. The encoding
        # is pinned so parsing does not depend on the platform's locale.
        with open(filename, encoding='utf-8') as f:
            config.read_file(f)
        return config


class TableScraper:
    """
    Scrapes one page of a paginated HTML table into a pandas DataFrame.

    The pipeline is: fetch the page (fetch_html_content), locate the table
    (parse_html_content), turn each row into a dictionary (scrape_table_data)
    and build the DataFrame (create_dataframe). scrape_and_get_dataframe runs
    all four steps in order.
    """

    # Seconds to wait for the server. Without a timeout a request to an
    # unresponsive host can block forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, page, config):
        """
        Initializes the TableScraper class.

        Args:
            page (int): The page number to scrape.
            config (ConfigParser): The configuration parser object; the
                'base_url' option of its 'Scraper' section is read.
        """
        self.base_url = config.get('Scraper', 'base_url')
        self.page = page
        self.url = self.construct_url()
        self.data = []
        # Created up front so every pipeline step can be called on its own
        # without hitting an AttributeError on a not-yet-assigned attribute.
        self.session = None
        self.html_content = None
        self.table = None

    def construct_url(self):
        """
        Constructs the URL for scraping based on the page number.

        The 'page' parameter is appended to any query string already present
        in base_url, and its value is URL-encoded.

        Returns:
            str: The constructed URL.
        """
        # Local import: the file-level import only brings in urljoin.
        from urllib.parse import urlencode, urlsplit, urlunsplit

        parts = urlsplit(self.base_url)
        page_query = urlencode({"page": self.page})
        # Keep an existing query string untouched and add to it with '&';
        # blindly inserting '?' would yield an invalid '...?a=b?page=0'.
        query = f"{parts.query}&{page_query}" if parts.query else page_query
        return urlunsplit(parts._replace(query=query))

    def fetch_html_content(self):
        """
        Fetches the HTML content of the webpage.

        On success the response body is stored in self.html_content. If the
        request fails, the error is logged and self.html_content is left as
        None so later steps can detect that nothing was fetched.
        """
        # Drop content from a previous call so a failure is never masked by
        # stale data.
        self.html_content = None

        if self.session is None:
            # One session per scraper, so the in-memory cache can actually be
            # hit on repeated fetches.
            self.session = requests_cache.CachedSession(backend='memory', expire_after=3600)

        try:
            response = self.session.get(self.url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()  # Raise an exception if the request was not successful
        except requests.RequestException as e:
            logging.error("Error occurred while fetching HTML content: %s", e)
        else:
            self.html_content = response.content

    def parse_html_content(self):
        """
        Parses the HTML content using BeautifulSoup.

        Stores the first <table class="list"> element in self.table, or None
        when no such table exists or no content has been fetched.
        """
        if self.html_content is None:
            self.table = None
            return

        soup = BeautifulSoup(self.html_content, 'html.parser')
        self.table = soup.find('table', class_='list')

    def scrape_table_data(self):
        """
        Scrapes the table data and stores it in a list of dictionaries.

        Appends one dictionary per '.pr-list-page-row' element to self.data.
        Logs a warning and does nothing when no table is available.
        """
        if self.table is None:
            logging.warning("No table found on the page.")
            return

        rows = self.table.select('.pr-list-page-row')
        self.data.extend(self.extract_row_data(row) for row in rows)

    def extract_row_data(self, row):
        """
        Extracts data from a table row and returns a dictionary.

        Any field whose element is missing from the row is returned as None.

        Args:
            row (BeautifulSoup): The BeautifulSoup object representing a table row.

        Returns:
            dict: The extracted data with the keys 'Date', 'Respondents',
                'Release Numbers' and 'Link'.
        """
        date_element = row.select_one('.datetime')
        date = date_element.text.strip() if date_element is not None else None

        respondents_element = row.select_one('.release-view__respondents')
        respondents = respondents_element.text.strip() if respondents_element is not None else None

        release_numbers_element = row.select_one('.view-table_subfield.view-table_subfield_release_number')
        if release_numbers_element is not None:
            release_numbers = release_numbers_element.text.replace('Release No.', '').strip()
        else:
            release_numbers = None

        # The link lives inside the respondents cell, so it can only be looked
        # up when that cell exists.
        link_element = respondents_element.find('a') if respondents_element is not None else None
        # .get() tolerates an anchor that has no href attribute.
        href = link_element.get('href') if link_element is not None else None
        link = urljoin(self.base_url, href) if href else None

        return {'Date': date, 'Respondents': respondents, 'Release Numbers': release_numbers, 'Link': link}

    def create_dataframe(self):
        """
        Creates a DataFrame from the scraped data.

        Returns:
            pd.DataFrame: The DataFrame containing the scraped data.
        """
        return pd.DataFrame(self.data)

    def scrape_and_get_dataframe(self):
        """
        Performs the entire scraping process and returns the DataFrame.

        Rows collected by an earlier call are discarded first, so calling this
        method repeatedly does not duplicate rows. If the page cannot be
        fetched or contains no table, an empty DataFrame is returned.

        Returns:
            pd.DataFrame: The DataFrame containing the scraped data.
        """
        self.data = []
        self.fetch_html_content()
        self.parse_html_content()
        self.scrape_table_data()
        return self.create_dataframe()


# Read the scraper settings (base URL) from config.ini in the working directory
config = ConfigLoader.load_config(filename='config.ini')

# Build a scraper for the first page (page number 0)
scraper = TableScraper(page=0, config=config)

# Run the fetch / parse / extract pipeline and collect the rows
df = scraper.scrape_and_get_dataframe()

# Show the first rows; as the last expression of a notebook cell this renders the table
df.head()

Unnamed: 0,Date,Respondents,Release Numbers,Link
0,"June 2, 2023",Pending Administrative Proceedings (Order Vaca...,"33-11199, 34-97641, IA-6324, IC-34934, AAER-4414",https://www.sec.gov/files/litigation/opinions/...
1,"June 2, 2023",Pending Administrative Proceedings (Order Dism...,"33-11198, 34-97640, IA-6323, IC-34933, AAER-4413",https://www.sec.gov/files/litigation/opinions/...
2,"June 1, 2023","Carl S. Schwartz, CPA","IA-6320, AAER-4412",https://www.sec.gov/files/litigation/admin/202...
3,"May 26, 2023","Gartner, Inc.","34-97609, AAER-4411",https://www.sec.gov/files/litigation/admin/202...
4,"May 25, 2023","Jia Roger Qian Wang, CPA and Wang Certified Pu...","34-97590, AAER-4410",https://www.sec.gov/files/litigation/opinions/...
