# SEC 13F-Scraper

## Import 

In [114]:
#!pip install selenium

In [1]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from datetime import datetime
from io import StringIO

In [2]:
def selector(driver, by, value, timeout=5):
    """Wait for an element to be present on the page and return it.

    Args:
        driver: Selenium WebDriver instance to query.
        by: Locator strategy (a ``By`` constant such as ``By.XPATH``).
        value: Locator expression matching the chosen strategy.
        timeout: Maximum number of seconds to wait for the element.

    Returns:
        Whatever ``WebDriverWait.until`` returns for the presence condition
        (the located element).
    """
    locator = (by, value)
    wait = WebDriverWait(driver, timeout)
    condition = EC.presence_of_element_located(locator)
    return wait.until(condition)

In [None]:
def get_quarter(date):
    """Return the ``(quarter_label, year)`` pair used to name output files.

    The mapping is shifted by one quarter relative to the calendar:
    April-June gives ``("q1", year)``, July-September ``("q2", year)``,
    October-December ``("q3", year)`` and January-March
    ``("q4", year - 1)``.
    NOTE(review): presumably this is the quarter that filings made in the
    given month report on -- confirm with the author.

    Args:
        date: Object exposing ``month`` and ``year`` attributes.

    Returns:
        Tuple of the quarter label and the (possibly decremented) year, or
        ``None`` when the month matches none of the twelve calendar months.
    """
    labels_by_month = {
        4: "q1", 5: "q1", 6: "q1",
        7: "q2", 8: "q2", 9: "q2",
        10: "q3", 11: "q3", 12: "q3",
        1: "q4", 2: "q4", 3: "q4",
    }
    label = labels_by_month.get(date.month)
    if label is None:
        return None

    # January-March belong to the last quarter of the preceding year.
    years_back = 1 if label == "q4" else 0
    return label, date.year - years_back

In [4]:
def calculate_share(scraped_count, total_count):
    """Return ``scraped_count`` as a rounded percentage of ``total_count``.

    Args:
        scraped_count: Number of entries that were processed.
        total_count: Total number of entries.

    Returns:
        The percentage rounded to the nearest integer, or ``0`` when
        ``total_count`` is zero (avoids a division by zero).
    """
    return 0 if total_count == 0 else round((scraped_count / total_count) * 100)

In [28]:
def scraper(cik, data, today_ignore='no'):
    """Scrape the newest 13F-HR holdings table from SEC EDGAR for each row of ``data``.

    For every row the EDGAR search page for the row's CIK is opened and the
    filing date of the first result is compared with the stored ``filed``
    value.  When the result is newer, the filing's holdings table is read and
    appended to a temporary CSV file in ``data_sec``; that file is renamed at
    the end so its name carries the share of processed entries.  Rows that
    failed are written to a separate ``error_...`` CSV.  Finally the ``filed``
    column of ``data`` is overwritten and the frame is saved to
    ``data/investors_cik.csv``.

    Args:
        cik: Unused; kept so existing callers keep working.  The CIKs are
            read from ``data['cik']``.
        data: DataFrame with the columns ``id``, ``vip``, ``name``, ``cik``
            and ``filed``.  Its ``filed`` column is modified in place.
        today_ignore: ``'yes'`` (case-insensitive) to skip rows whose
            ``filed`` value equals today's date.

    Returns:
        None.  Results are written to disk and progress is printed.
    """
    driver = webdriver.Chrome()
    os.makedirs('data_sec', exist_ok=True)

    # Current date, plus the quarter/year pair used in the output file names.
    date_current = datetime.now()
    date_str = date_current.strftime("%Y-%m-%d")
    quarter, year = get_quarter(date_current)

    entries_total = len(data)
    # Counts rows that were scraped or deliberately skipped (not the failures).
    entries_scraped = 0

    # The temporary CSV path is only chosen once a table was actually scraped.
    path_csv_temp = None

    filed_updated = []
    entries_error = []  # Identifying columns of the rows that failed

    skip_today = str(today_ignore).lower() == 'yes'

    try:
        # Iterate over the index labels so ``.loc`` also works when ``data``
        # does not carry a default 0..n-1 index.
        for row in data.index:
            cik_value = str(data.loc[row, 'cik']).zfill(10)
            filed_old = data.loc[row, 'filed']

            # Skip rows with today's date if requested
            if skip_today and filed_old == date_str:
                print(f"CIK {cik_value}: Entry has today's date ({filed_old}). Skipping ...")
                filed_updated.append(filed_old)  # Keep the old filed value
                entries_scraped += 1  # Counted as processed
                continue

            entry_info = {
                'id': data.loc[row, 'id'],
                'vip': data.loc[row, 'vip'],
                'name': data.loc[row, 'name'],
                'cik': cik_value,
            }

            # Exactly one value per row is appended to ``filed_updated`` (in
            # the ``finally`` below).  It only becomes the new filing date
            # after the table was written, so a failure half-way through
            # keeps the old date and the column length always matches
            # ``data``.
            filed_result = filed_old

            try:
                url = f"https://www.sec.gov/edgar/search/#/q={cik_value}&filter_forms=13F-HR&sort=desc"
                driver.get(url)

                time.sleep(2)  # Short wait for initial load

                try:
                    result_first = selector(driver, By.CSS_SELECTOR, "a.preview-file[data-adsh]")
                except TimeoutException:
                    print(f"CIK {cik_value}: No search results found. Skipping ...")
                    entries_error.append(dict(entry_info))
                    continue

                filed_new = selector(driver, By.XPATH, "/html/body/div[3]/div[2]/div[2]/table/tbody/tr[1]/td[2]").text
                reported_new = selector(driver, By.XPATH, "/html/body/div[3]/div[2]/div[2]/table/tbody/tr[1]/td[3]").text

                # Compare filed dates
                if pd.notna(filed_old) and filed_old != '':
                    date_old = datetime.strptime(filed_old, "%Y-%m-%d")
                    date_new = datetime.strptime(filed_new, "%Y-%m-%d")
                    if date_new <= date_old:
                        print(f"CIK {cik_value}: No new filing. Skipping ...")
                        entries_scraped += 1  # Counted as processed
                        continue

                print(f"CIK {cik_value}: New filing found. Scraping ...")

                result_first.click()

                open_filing_button = selector(driver, By.XPATH, "//button[contains(@class, 'btn-warning') and contains(text(), 'Open filing')]")
                open_filing_button.click()

                # The filing opens in a new tab; continue there.
                driver.switch_to.window(driver.window_handles[-1])

                # Speed-up: reload with a short page-load timeout and stop
                # loading once it expires.
                # NOTE(review): the timeout is never reset, so it presumably
                # also applies to the search page of the following CIKs --
                # confirm this is intended.
                driver.set_page_load_timeout(3)
                try:
                    driver.get(driver.current_url)
                except TimeoutException:
                    driver.execute_script("window.stop();")

                target_link_xpath = "//table[@class='tableFile']/tbody/tr[4]/td[3]/a"
                target_link = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, target_link_xpath))
                )
                target_link.click()

                table = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, "/html/body/table[2]"))
                )

                # read_html expects a file-like object, hence the StringIO wrapper.
                table_html = table.get_attribute('outerHTML')
                df = pd.read_html(StringIO(table_html))[0]

                df.columns = [
                    "company", "stock type", "cusip-id", "figi-id", "value",
                    "number", "principal amount", "call/put", "discretion",
                    "manager", "sole voting authority", "shared voting authority",
                    "none voting authority"
                ]

                # Repeat the identifying columns on every holdings row.
                df_input = pd.DataFrame({
                    **entry_info,
                    'filed': filed_new,
                    'enddate': reported_new
                }, index=df.index)

                df = pd.concat([df_input, df], axis=1)

                if path_csv_temp is None:
                    path_csv_temp = os.path.join('data_sec', f'{year}-{quarter}_temp_{date_str}.csv')

                # Write the header only when the file does not exist yet.
                df.to_csv(path_csv_temp, mode='a', header=not os.path.exists(path_csv_temp), index=False)

                filed_result = filed_new
                entries_scraped += 1

            except Exception as e:
                print(f"CIK {cik_value}: An error occurred! {e}")
                entries_error.append(dict(entry_info))

            finally:
                filed_updated.append(filed_result)

                # Close all tabs except the first one
                while len(driver.window_handles) > 1:
                    driver.switch_to.window(driver.window_handles[-1])
                    driver.close()
                driver.switch_to.window(driver.window_handles[0])
    finally:
        # Always shut the browser down, even if the loop aborted.
        driver.quit()

    if entries_scraped > 0 and path_csv_temp is not None:
        # Rename the temporary file so its name carries the share percentage
        share = calculate_share(entries_scraped, entries_total)
        path_csv_finalized = os.path.join('data_sec', f'{year}-{quarter}_{share}%_{date_str}.csv')
        os.rename(path_csv_temp, path_csv_finalized)
        print(f"Data saved to {path_csv_finalized}")
    else:
        print("No entries scraped. No file created.")

    # Save error entries to a separate CSV file if any errors occurred
    if entries_error:
        complement_share = round(100 - calculate_share(entries_scraped, entries_total))
        error_file_path = os.path.join('data_sec', f'error_{year}-{quarter}_{complement_share}%_{date_str}.csv')
        pd.DataFrame(entries_error).to_csv(error_file_path, index=False)
        print(f"Error details saved to {error_file_path}")

    # Persist the updated filed values only after all scraping is done
    data['filed'] = filed_updated
    data.to_csv('data/investors_cik.csv', index=False)

In [35]:
def main():
    """Load the investor list and run the scraper, skipping rows filed today."""
    investors = pd.read_csv('data/investors_cik.csv')

    # CIKs as a list of strings, passed as the scraper's first argument.
    cik_column = investors['cik']
    cik_strings = cik_column.astype(str).tolist()

    scraper(cik_strings, investors, 'yes')


if __name__ == "__main__":
    main()

CIK 0001336528: No new filing. Skipping ...
CIK 0001067983: No new filing. Skipping ...
CIK 0001096343: No new filing. Skipping ...
CIK 0000859804: No new filing. Skipping ...
CIK 0000807985: No new filing. Skipping ...
CIK 0001035674: No new filing. Skipping ...
CIK 0000915191: No new filing. Skipping ...
CIK 0000883965: New filing found. Scraping ...
Data saved to data_sec/2024-q4_100%_2025-01-07.csv


- Berechnung quartalsabhängig machen
- Zweiten Scraper für Zusatzinfos



This is the variant of the scraper that additionally skips entries whose stored filing date falls in a previous quarter.

In [None]:
def scraper(cik, data):
    """Scrape new 13F-HR holdings tables, skipping rows last filed in a previous quarter.

    For every CIK the EDGAR search page is opened and the filing date of the
    first result is read.  A row is skipped when its stored ``filed`` date
    maps (via ``get_quarter``) to a different quarter/earlier year than
    today, or when the result is not newer than the stored date.  Otherwise
    the filing's holdings table is appended to a CSV file in ``data_sec``.
    Finally the ``filed`` column of ``data`` is overwritten and the frame is
    saved to ``data/test.csv``.

    Args:
        cik: Iterable of CIK values.  The position of each value is used as
            the row label into ``data``, so it is assumed to be aligned with
            a 0..n-1 index.
        data: DataFrame with the columns ``id``, ``vip``, ``name`` and
            ``filed``.  Its ``filed`` column is modified in place.

    Returns:
        None.  Results are written to disk and progress is printed.
    """
    driver = webdriver.Chrome()
    os.makedirs('data_sec', exist_ok=True)

    current_date = datetime.now()
    date_str = current_date.strftime("%Y-%m-%d")
    # The helper is named get_quarter; the results get their own names so
    # they cannot shadow it.
    quarter_current, year_current = get_quarter(current_date)

    path_csv = os.path.join('data_sec', f'{year_current}-{quarter_current}_{date_str}.csv')

    filed_updated = []

    try:
        for row, cik_raw in enumerate(cik):
            # Read the row before the try block so the error handler below
            # can always refer to these values.
            cik_value = str(cik_raw).zfill(10)
            id_value = data.loc[row, 'id']
            vip_value = data.loc[row, 'vip']
            name_value = data.loc[row, 'name']
            filed_old = data.loc[row, 'filed']

            # Exactly one value per row is appended to ``filed_updated`` (in
            # the ``finally`` below); it only becomes the new date once the
            # table was written successfully.
            filed_result = filed_old

            try:
                url = f"https://www.sec.gov/edgar/search/#/q={cik_value}&filter_forms=13F-HR&sort=desc"
                driver.get(url)

                time.sleep(2)  # Short wait for initial load

                result_first = selector(driver, By.CSS_SELECTOR, "a.preview-file[data-adsh]")

                filed_new = selector(driver, By.XPATH, "/html/body/div[3]/div[2]/div[2]/table/tbody/tr[1]/td[2]").text
                quarter_new = selector(driver, By.XPATH, "/html/body/div[3]/div[2]/div[2]/table/tbody/tr[1]/td[3]").text

                # Compare filed dates
                if pd.notna(filed_old) and filed_old != '':
                    date_old = datetime.strptime(filed_old, "%Y-%m-%d")
                    date_new = datetime.strptime(filed_new, "%Y-%m-%d")

                    quarter_old, year_old = get_quarter(date_old)

                    # Skip if date_old is from a previous quarter
                    if year_old < year_current or quarter_old != quarter_current:
                        print(f"CIK {cik_value}: Filing is from a previous quarter ({quarter_old}, {year_old}). Skipping...")
                        continue

                    # Skip if date_new is not newer than date_old
                    if date_new <= date_old:
                        print(f"CIK {cik_value}: No new filing in the current quarter. Skipping...")
                        continue

                print(f"New filing found for CIK {cik_value}. Proceeding with scraping...")

                result_first.click()

                open_filing_button = selector(driver, By.XPATH, "//button[contains(@class, 'btn-warning') and contains(text(), 'Open filing')]")
                open_filing_button.click()

                # The filing opens in a new tab; continue there.
                driver.switch_to.window(driver.window_handles[-1])

                # Reload with a short page-load timeout and stop loading once
                # it expires.
                driver.set_page_load_timeout(3)
                try:
                    driver.get(driver.current_url)
                except TimeoutException:
                    driver.execute_script("window.stop();")

                target_link_xpath = "//table[@class='tableFile']/tbody/tr[4]/td[3]/a"
                target_link = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, target_link_xpath))
                )
                target_link.click()

                table = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, "/html/body/table[2]"))
                )

                # read_html expects a file-like object, hence the StringIO wrapper.
                table_html = table.get_attribute('outerHTML')
                df = pd.read_html(StringIO(table_html))[0]

                df.columns = [
                    "company", "stock type", "cusip-id", "figi-id", "value",
                    "number", "principal amount", "call/put", "discretion",
                    "manager", "sole voting authority", "shared voting authority",
                    "none voting authority"
                ]

                # Repeat the identifying columns on every holdings row and
                # attach them to the table before writing it.
                df_input = pd.DataFrame({
                    'id': id_value,
                    'vip': vip_value,
                    'name': name_value,
                    'cik': cik_value,
                    'filed': filed_new,
                    'enddate': quarter_new
                }, index=df.index)

                df = pd.concat([df_input, df], axis=1)

                # Write the header only when the file does not exist yet.
                df.to_csv(path_csv, mode='a', header=not os.path.exists(path_csv), index=False)

                filed_result = filed_new

            except Exception as e:
                # The old filed value is kept, so the row is retried next run.
                print(f"An error occurred for CIK {cik_value}: {e}")

            finally:
                filed_updated.append(filed_result)

                # Close all tabs except the first one
                while len(driver.window_handles) > 1:
                    driver.switch_to.window(driver.window_handles[-1])
                    driver.close()
                driver.switch_to.window(driver.window_handles[0])
    finally:
        # Always shut the browser down, even if the loop aborted.
        driver.quit()

    data['filed'] = filed_updated
    data.to_csv('data/test.csv', index=False)



In [None]:
1	Bill Ackman	Pershing Square Capital Management	1336528
2	Warren Buffett	Berkshire Hathaway	1067983
3	Tom Gayner	Markel Corporation	1096343
4	Tony Guerrerio and David Rolfe	Wedgewood Partners	859804
5	Mason Hawkins	Southeastern Asset Management	807985
6	John Paulson	Paulson & Co.	1035674
7	Prem Watsa	Fairfax Financial Holdings	915191
8	Wally Weitz	Wallace R. Weitz & Company	883965
