# SEC 13F-Scraper

## Imports

In [4]:
#!pip install selenium

In [21]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [63]:

def _reset_browser(driver, main_window):
    """Close every window except *main_window*, refocus it and blank it.

    Loading ``about:blank`` makes the next search URL a full page load
    instead of a fragment-only change on an already open search page.

    Best effort: a failure here is reported and swallowed so that one broken
    tab cannot abort the remaining CIKs.
    """
    try:
        for handle in driver.window_handles:
            if handle != main_window:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)
        driver.get("about:blank")
    except Exception as e:
        print(f"Could not reset browser windows: {e}")


def scrape_sec_filings(cik_list):
    """Save the text of one 13F-HR filing document per CIK into ``data_sec/``.

    For every entry of *cik_list* the function drives a Chrome browser
    through these steps:

    1. open the EDGAR full-text search filtered on form ``13F-HR``,
    2. click the first search result and then the "Open filing" button,
    3. on the filing index page, follow the link found in the 4th row,
       3rd column of the ``tableFile`` table,
    4. write the visible text of the resulting page's ``<body>`` to
       ``data_sec/<cik>.csv``.

    NOTE: the output file carries a ``.csv`` extension but contains the raw
    page text exactly as returned by Selenium; no CSV formatting is applied.

    Args:
        cik_list: Iterable of CIK identifiers. Each value is interpolated
            into the search URL and into the output file name.

    Returns:
        None. Progress and per-CIK errors are printed to stdout; a failure
        for one CIK does not stop the others.
    """
    # Local import: only needed for the tolerant "wait for new tab" step.
    from selenium.common.exceptions import TimeoutException

    # Create the data_sec directory if it doesn't exist (done before the
    # browser is started so a filesystem error cannot leak a Chrome process)
    os.makedirs('data_sec', exist_ok=True)

    driver = webdriver.Chrome()  # Initialize the WebDriver
    try:
        main_window = driver.current_window_handle

        for cik_number in cik_list:
            try:
                # Open the SEC EDGAR search page with the specific CIK number
                url = f"https://www.sec.gov/edgar/search/#/q={cik_number}&filter_forms=13F-HR&sort=desc"
                driver.get(url)

                # Wait for the first search result to be clickable and click it
                first_result = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "a.preview-file[data-adsh]"))
                )
                first_result.click()

                # Wait for the popup to appear and click the "Open filing" button
                open_filing_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'btn-warning') and contains(text(), 'Open filing')]"))
                )
                handles_before = driver.window_handles
                open_filing_button.click()

                # Switch to the new tab that opens. The tab may not exist yet
                # right after the click, so wait for it; if none shows up we
                # stay in the current window and let the next wait decide.
                try:
                    WebDriverWait(driver, 10).until(
                        EC.new_window_is_opened(handles_before)
                    )
                except TimeoutException:
                    pass
                new_handles = [
                    handle for handle in driver.window_handles
                    if handle not in handles_before
                ]
                if new_handles:
                    driver.switch_to.window(new_handles[-1])

                # Wait for the table to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "tableFile"))
                )

                # Find and click the link in the 4th row, 3rd column
                target_link = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//table[@class='tableFile']/tbody/tr[4]/td[3]/a"))
                )
                target_link.click()

                # Wait for the document to load and scrape its content
                document_content = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                ).text

                # Save the scraped content to a file named after the CIK
                csv_file_path = os.path.join('data_sec', f'{cik_number}.csv')
                with open(csv_file_path, 'w', encoding='utf-8') as f:
                    f.write(document_content)

                print(f"Content for CIK {cik_number} saved to {csv_file_path}")

            except Exception as e:
                print(f"An error occurred for CIK {cik_number}: {e}")

            finally:
                # Every CIK starts from the same state as the first one:
                # a single blank tab, no leftover filing tabs or dialogs.
                _reset_browser(driver, main_window)
    finally:
        # Close the browser after scraping, even on an unexpected abort
        driver.quit()


In [64]:
def main():
    """Scrape the filings for every CIK listed in ``data/test.csv``.

    The ``cik`` column of the CSV file is converted to a plain list and
    handed to ``scrape_sec_filings``.

    TODO: change the input to investors_cik.csv later.
    """
    cik_frame = pd.read_csv('data/test.csv')
    scrape_sec_filings(cik_frame['cik'].tolist())


if __name__ == "__main__":
    main()

Content for CIK 1067983 saved to data_sec/1067983.csv


In [None]:
def _reset_browser(driver, main_window):
    """Close every window except *main_window*, refocus it and blank it.

    Loading ``about:blank`` makes the next search URL a full page load
    instead of a fragment-only change on an already open search page.

    Best effort: a failure here is reported and swallowed so that one broken
    tab cannot abort the remaining CIKs.
    """
    try:
        for handle in driver.window_handles:
            if handle != main_window:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)
        driver.get("about:blank")
    except Exception as e:
        print(f"Could not reset browser windows: {e}")


def scrape_sec_filings(cik_list):
    """Export one table of a 13F-HR filing document per CIK to ``data_sec/``.

    For every entry of *cik_list* the function drives a Chrome browser
    through these steps:

    1. open the EDGAR full-text search filtered on form ``13F-HR``,
    2. click the first search result and then the "Open filing" button,
    3. on the filing index page, follow the link found in the 4th row,
       3rd column of the ``tableFile`` table,
    4. locate the table whose ``summary`` attribute is
       ``Form 13F-NT Header Information`` and collect the stripped text of
       the ``<td>`` cells of each row (rows without any ``<td>`` are
       skipped),
    5. write those rows to ``data_sec/<cik>.csv``.

    Args:
        cik_list: Iterable of CIK identifiers. Each value is interpolated
            into the search URL and into the output file name.

    Returns:
        None. Progress and per-CIK errors are printed to stdout; a failure
        for one CIK does not stop the others.
    """
    # Local imports: names this function needs beyond the notebook's imports.
    import csv
    from selenium.common.exceptions import TimeoutException

    # Create the data_sec directory if it doesn't exist (done before the
    # browser is started so a filesystem error cannot leak a Chrome process)
    os.makedirs('data_sec', exist_ok=True)

    driver = webdriver.Chrome()  # Initialize the WebDriver
    try:
        main_window = driver.current_window_handle

        for cik_number in cik_list:
            try:
                # Open the SEC EDGAR search page with the specific CIK number
                url = f"https://www.sec.gov/edgar/search/#/q={cik_number}&filter_forms=13F-HR&sort=desc"
                driver.get(url)

                # Wait for the first search result to be clickable and click it
                first_result = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "a.preview-file[data-adsh]"))
                )
                first_result.click()

                # Wait for the popup to appear and click the "Open filing" button
                open_filing_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'btn-warning') and contains(text(), 'Open filing')]"))
                )
                handles_before = driver.window_handles
                open_filing_button.click()

                # Switch to the new tab that opens. The tab may not exist yet
                # right after the click, so wait for it; if none shows up we
                # stay in the current window and let the next wait decide.
                try:
                    WebDriverWait(driver, 10).until(
                        EC.new_window_is_opened(handles_before)
                    )
                except TimeoutException:
                    pass
                new_handles = [
                    handle for handle in driver.window_handles
                    if handle not in handles_before
                ]
                if new_handles:
                    driver.switch_to.window(new_handles[-1])

                # Wait for the table to load on this page
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "tableFile"))
                )

                # Find and click the link in the 4th row, 3rd column
                target_link = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//table[@class='tableFile']/tbody/tr[4]/td[3]/a"))
                )
                target_link.click()

                # Wait for the document page to load (the last page)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Locate the table with summary="Form 13F-NT Header Information"
                second_table = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, "//table[@summary='Form 13F-NT Header Information']"))
                )

                # Prepare data for CSV: stripped <td> texts, one list per row
                table_data = []
                for row in second_table.find_elements(By.TAG_NAME, "tr"):
                    cols = [
                        col.text.strip()
                        for col in row.find_elements(By.TAG_NAME, "td")
                    ]
                    if cols:  # Only keep rows that have at least one <td>
                        table_data.append(cols)

                # Save the scraped content to a CSV file named after the CIK.
                # csv.writer quotes cells containing commas, quotes or line
                # breaks, which a plain ','.join() would silently corrupt.
                csv_file_path = os.path.join('data_sec', f'{cik_number}.csv')
                with open(csv_file_path, 'w', encoding='utf-8', newline='') as f:
                    csv.writer(f, lineterminator='\n').writerows(table_data)

                print(f"Content for CIK {cik_number} saved to {csv_file_path}")

            except Exception as e:
                print(f"An error occurred for CIK {cik_number}: {e}")

            finally:
                # Every CIK starts from the same state as the first one:
                # a single blank tab, no leftover filing tabs or dialogs.
                _reset_browser(driver, main_window)
    finally:
        # Close the browser after scraping, even on an unexpected abort
        driver.quit()

def main():
    """Scrape the filings for every CIK listed in ``data/test.csv``.

    The path is relative to the current working directory. The ``cik``
    column of the CSV file is converted to a plain list and handed to
    ``scrape_sec_filings``.
    """
    cik_frame = pd.read_csv('data/test.csv')
    cik_values = cik_frame['cik'].tolist()
    scrape_sec_filings(cik_values)


if __name__ == "__main__":
    main()
