In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the HTML content
url = "https://www.sec.gov/divisions/enforce/friactions"
response = requests.get(url)
html_content = response.content

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find the table element
table = soup.find('table', class_='list')

# Extract table data and store it in a list of dictionaries
data = []
for row in table.find_all('tr', class_='pr-list-page-row'):
    date = row.find('time', class_='datetime').text.strip()
    
    respondents_element = row.find('div', class_='release-view__respondents')
    respondents = respondents_element.text.strip() if respondents_element else None
    
    release_numbers_element = row.find('div', class_='view-table_subfield view-table_subfield_release_number')
    release_numbers = release_numbers_element.text.replace('Release No.', '').strip() if release_numbers_element else None
    
    link_element = respondents_element.find('a')
    link = link_element['href'] if link_element else None
    
    data.append({'Date': date, 'Respondents': respondents, 'Release Numbers': release_numbers, 'Link': link})

# Create a DataFrame from the scraped data
df = pd.DataFrame(data)

# Print the DataFrame
df


Unnamed: 0,Date,Respondents,Release Numbers,Link
0,"June 2, 2023",Pending Administrative Proceedings (Order Vaca...,"33-11199, 34-97641, IA-6324, IC-34934, AAER-4414",https://www.sec.gov/files/litigation/opinions/...
1,"June 2, 2023",Pending Administrative Proceedings (Order Dism...,"33-11198, 34-97640, IA-6323, IC-34933, AAER-4413",https://www.sec.gov/files/litigation/opinions/...
2,"June 1, 2023","Carl S. Schwartz, CPA","IA-6320, AAER-4412",https://www.sec.gov/files/litigation/admin/202...
3,"May 26, 2023","Gartner, Inc.","34-97609, AAER-4411",https://www.sec.gov/files/litigation/admin/202...
4,"May 25, 2023","Jia Roger Qian Wang, CPA and Wang Certified Pu...","34-97590, AAER-4410",https://www.sec.gov/files/litigation/opinions/...
...,...,...,...,...
95,"Aug. 3, 2022",Brian K. Hutchison,"LR-25463, AAER-4319",/litigation/litreleases/lr-25463
96,"Aug. 3, 2022","Surgalign Holdings, Inc. and Robert P. Jordheim","33-11088, 34-95415, AAER-4318",https://www.sec.gov/files/litigation/admin/202...
97,"Aug. 3, 2022","Anton & Chia, LLP Wahl, Gregory A., CPA Deutch...","34-95413, AAER-4317",https://www.sec.gov/files/litigation/opinions/...
98,"July 8, 2022","Arcangelo Loberto, CPA and CA","34-95224, AAER-4316",https://www.sec.gov/files/litigation/admin/202...


In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

class TableScraper:
    def __init__(self, page):
        """
        Initializes the TableScraper class.

        Args:
            page (int): The page number to scrape.
        """
        self.base_url = "https://www.sec.gov/divisions/enforce/friactions"
        self.page = page
        self.url = self.construct_url()
        self.data = []

    def construct_url(self):
        """
        Constructs the URL for scraping based on the page number.

        Returns:
            str: The constructed URL.
        """
        params = {"page": self.page}
        url = urljoin(self.base_url, "?".join([self.base_url, "&".join(f"{k}={v}" for k, v in params.items())]))
        return url

    def fetch_html_content(self):
        """
        Fetches the HTML content of the webpage.
        """
        response = requests.get(self.url)
        self.html_content = response.content

    def parse_html_content(self):
        """
        Parses the HTML content using BeautifulSoup.
        """
        soup = BeautifulSoup(self.html_content, 'html.parser')
        self.table = soup.find('table', class_='list')

    def scrape_table_data(self):
        """
        Scrapes the table data and stores it in a list of dictionaries.
        """
        for row in self.table.find_all('tr', class_='pr-list-page-row'):
            date = row.find('time', class_='datetime').text.strip()

            respondents_element = row.find('div', class_='release-view__respondents')
            respondents = respondents_element.text.strip() if respondents_element else None

            release_numbers_element = row.find('div', class_='view-table_subfield view-table_subfield_release_number')
            release_numbers = release_numbers_element.text.replace('Release No.', '').strip() if release_numbers_element else None

            link_element = respondents_element.find('a')
            link = link_element['href'] if link_element else None

            self.data.append({'Date': date, 'Respondents': respondents, 'Release Numbers': release_numbers, 'Link': link})

    def create_dataframe(self):
        """
        Creates a DataFrame from the scraped data.

        Returns:
            pd.DataFrame: The DataFrame containing the scraped data.
        """
        df = pd.DataFrame(self.data)
        return df

    def scrape_and_get_dataframe(self):
        """
        Performs the entire scraping process and returns the DataFrame.

        Returns:
            pd.DataFrame: The DataFrame containing the scraped data.
        """
        self.fetch_html_content()
        self.parse_html_content()
        self.scrape_table_data()
        df = self.create_dataframe()
        return df


# Create an instance of the TableScraper class with page number 0
scraper = TableScraper(0)

# Scrape the table and get the DataFrame
df = scraper.scrape_and_get_dataframe()

# Print the DataFrame
df


Unnamed: 0,Date,Respondents,Release Numbers,Link
0,"June 2, 2023",Pending Administrative Proceedings (Order Vaca...,"33-11199, 34-97641, IA-6324, IC-34934, AAER-4414",https://www.sec.gov/files/litigation/opinions/...
1,"June 2, 2023",Pending Administrative Proceedings (Order Dism...,"33-11198, 34-97640, IA-6323, IC-34933, AAER-4413",https://www.sec.gov/files/litigation/opinions/...
2,"June 1, 2023","Carl S. Schwartz, CPA","IA-6320, AAER-4412",https://www.sec.gov/files/litigation/admin/202...
3,"May 26, 2023","Gartner, Inc.","34-97609, AAER-4411",https://www.sec.gov/files/litigation/admin/202...
4,"May 25, 2023","Jia Roger Qian Wang, CPA and Wang Certified Pu...","34-97590, AAER-4410",https://www.sec.gov/files/litigation/opinions/...
...,...,...,...,...
95,"Aug. 3, 2022",Brian K. Hutchison,"LR-25463, AAER-4319",/litigation/litreleases/lr-25463
96,"Aug. 3, 2022","Surgalign Holdings, Inc. and Robert P. Jordheim","33-11088, 34-95415, AAER-4318",https://www.sec.gov/files/litigation/admin/202...
97,"Aug. 3, 2022","Anton & Chia, LLP Wahl, Gregory A., CPA Deutch...","34-95413, AAER-4317",https://www.sec.gov/files/litigation/opinions/...
98,"July 8, 2022","Arcangelo Loberto, CPA and CA","34-95224, AAER-4316",https://www.sec.gov/files/litigation/admin/202...
