# SEC 13F-Scraper

## Import 

In [1]:
#!pip install selenium

In [2]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from datetime import datetime
from io import StringIO

In [3]:
def selector(driver, by, value, timeout=5):
    """Wait until an element is present in the DOM and return it.

    Args:
        driver: Selenium WebDriver instance to query.
        by: Locator strategy (a ``By`` constant).
        value: Locator expression matching the strategy.
        timeout: Maximum number of seconds to wait for the element.

    Returns:
        The first element matching the locator.

    Raises:
        TimeoutException: If no matching element appears within ``timeout``.
    """
    locator = (by, value)
    wait = WebDriverWait(driver, timeout)
    return wait.until(EC.presence_of_element_located(locator))

In [4]:
def scraper(cik, data):
    """Scrape the newest 13F-HR holdings table for each CIK and persist it.

    For every CIK the EDGAR search page is opened and the filing date of the
    first result is compared with the date stored in ``data``. When the
    filing is newer (or no date is stored) the filing's holdings table is
    appended to ``data_sec/<today>.csv``. After all CIKs are processed the
    ``filed`` column of ``data`` is replaced and written to ``data/test.csv``.

    A CIK's stored date is only advanced once its table has been written
    successfully, so a failed scrape is retried on the next run.

    Args:
        cik: Iterable of CIK numbers, positionally aligned with the rows of
            ``data`` (one entry per row).
        data: DataFrame providing the columns ``id``, ``vip``, ``name`` and
            ``filed``; ``filed`` holds ``%Y-%m-%d`` strings, empty strings
            or NaN.
    """
    # W3C WebDriver default page-load timeout (seconds); restored after each
    # CIK so the shortened timeout below does not leak into the next search.
    default_page_load_timeout = 300
    short_page_load_timeout = 3
    first_row_xpath = "/html/body/div[3]/div[2]/div[2]/table/tbody/tr[1]"

    os.makedirs('data_sec', exist_ok=True)

    date = datetime.now().strftime("%Y-%m-%d")
    path_csv = os.path.join('data_sec', f'{date}.csv')

    filed_updated = []

    driver = webdriver.Chrome()
    try:
        for position, raw_cik in enumerate(cik):
            # Bound before the try so the error message can always use it.
            cik_value = str(raw_cik).zfill(10)
            # Value written back for this row; exactly one entry per CIK.
            filed_result = None

            try:
                # ``cik`` is aligned with ``data`` by position, not by label.
                id_value = data['id'].iloc[position]
                vip_value = data['vip'].iloc[position]
                name_value = data['name'].iloc[position]
                filed_old = data['filed'].iloc[position]
                filed_result = filed_old  # Kept unless the scrape succeeds

                url = f"https://www.sec.gov/edgar/search/#/q={cik_value}&filter_forms=13F-HR&sort=desc"
                driver.get(url)

                time.sleep(2)  # Short wait for initial load

                result_first = selector(driver, By.CSS_SELECTOR, "a.preview-file[data-adsh]")

                filed_new = selector(driver, By.XPATH, f"{first_row_xpath}/td[2]").text
                quarter = selector(driver, By.XPATH, f"{first_row_xpath}/td[3]").text

                # Compare filed dates
                if pd.notna(filed_old) and filed_old != '':
                    date_old = datetime.strptime(filed_old, "%Y-%m-%d")
                    date_new = datetime.strptime(filed_new, "%Y-%m-%d")
                    print(f"CIK: {cik_value}, Existing date: {date_old}, New date: {date_new}")
                    if date_new <= date_old:
                        print(f"No new filing for CIK {cik_value}. Skipping...")
                        continue  # Skip to the next CIK

                print(f"New filing found for CIK {cik_value}. Proceeding with scraping...")

                result_first.click()

                open_filing_button = selector(driver, By.XPATH, "//button[contains(@class, 'btn-warning') and contains(text(), 'Open filing')]")
                open_filing_button.click()

                # The filing opens in a new tab; continue there.
                driver.switch_to.window(driver.window_handles[-1])

                # Speed-up: cap the page load and stop it once the cap is
                # hit, so parsing does not wait for every resource.
                driver.set_page_load_timeout(short_page_load_timeout)
                try:
                    driver.get(driver.current_url)
                except TimeoutException:
                    driver.execute_script("window.stop();")

                target_link_xpath = "//table[@class='tableFile']/tbody/tr[4]/td[3]/a"
                target_link = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, target_link_xpath))
                )
                target_link.click()

                table = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, "/html/body/table[2]"))
                )

                table_html = table.get_attribute('outerHTML')

                # Use StringIO to wrap the HTML string
                html_io = StringIO(table_html)
                df = pd.read_html(html_io)[0]

                # Drop the first three rows of the parsed table before
                # assigning the column names.
                df = df.iloc[3:].reset_index(drop=True)

                df.columns = [
                    "company", "stock type", "cusip-id", "figi-id", "value",
                    "number", "principal amount", "call/put", "discretion",
                    "manager", "sole voting authority", "shared voting authority",
                    "none voting authority"
                ]

                # Repeat the filer metadata on every holdings row.
                df_input = pd.DataFrame({
                    'id': id_value,
                    'vip': vip_value,
                    'name': name_value,
                    'cik': cik_value,
                    'filed': filed_new,
                    'enddate': quarter
                }, index=df.index)

                df = pd.concat([df_input, df], axis=1)

                df.to_csv(path_csv, mode='a', header=not os.path.exists(path_csv), index=False)

                # Only now is the new filing date safe to record.
                filed_result = filed_new

            except Exception as e:
                # Best effort: report the failure and keep the existing value.
                print(f"An error occurred for CIK {cik_value}: {e}")

            finally:
                filed_updated.append(filed_result)

                # Close all tabs except the first one
                while len(driver.window_handles) > 1:
                    driver.switch_to.window(driver.window_handles[-1])
                    driver.close()
                driver.switch_to.window(driver.window_handles[0])

                driver.set_page_load_timeout(default_page_load_timeout)
    finally:
        # Always release the browser, even if the loop aborts unexpectedly.
        driver.quit()

    # Update test.csv with new filed values only after all scraping is done
    data['filed'] = filed_updated
    data.to_csv('data/test.csv', index=False)

In [5]:
def main():
    """Read the tracked filers from ``data/test.csv`` and scrape each of them."""
    filers = pd.read_csv('data/test.csv')
    cik_column = filers['cik'].astype(str)
    scraper(cik_column.tolist(), filers)

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

CIK: 0001336528, Existing date: 2024-11-14 00:00:00, New date: 2024-11-14 00:00:00
No new filing for CIK 0001336528. Skipping...
CIK: 0001067983, Existing date: 2024-11-12 00:00:00, New date: 2024-11-14 00:00:00
New filing found for CIK 0001067983. Proceeding with scraping...


- Make the calculation quarter-dependent
- Overwrite the output file instead of appending to it
- Add a second scraper for additional information
- Add error handling for 404 responses and empty search results