<a href="https://colab.research.google.com/github/ltejadavic/EdgarDB/blob/main/EdgarDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import csv


def fetch_metadata(cik):
    """
    Fetch metadata for the given CIK from the SEC submissions API.
    Args:
        cik (str | int): Central Index Key (CIK) of the entity. It is left-padded
            with zeros to 10 digits before being placed in the URL.
    Returns:
        dict: JSON metadata of the filings for the entity.
    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the request
            exceeds the timeout.
    """
    # The submissions endpoint names its files with the zero-padded 10-digit CIK
    padded_cik = str(cik).zfill(10)
    url = f"https://data.sec.gov/submissions/CIK{padded_cik}.json"
    headers = {'User-Agent': "getbagsfinance@gmail.com"}  # Required user-agent header for SEC API
    # A timeout keeps a stalled connection from blocking the whole run
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an error response instead of trying to JSON-decode an error page
    response.raise_for_status()
    print(f"Fetched data from {url}")
    return response.json()  # Return the JSON metadata


def get_10k_urls(metadata, start_year=2020, end_year=2024):
    """
    Extract 10-K filing URLs from the metadata JSON for a range of filing years.
    Args:
        metadata (dict): JSON metadata of the filings for the entity.
        start_year (int): First filing year to include (inclusive). Defaults to 2020.
        end_year (int): Last filing year to include (inclusive). Defaults to 2024.
    Returns:
        list: List of tuples containing filing URLs and their corresponding years.
              The year is taken from the filing date, not from the fiscal period.
    """
    urls = []
    recent_filings = metadata.get('filings', {}).get('recent', {})
    # 'form', 'filingDate', 'accessionNumber' and 'primaryDocument' are parallel
    # lists: index i describes the same filing in each of them.
    for i, form_type in enumerate(recent_filings.get('form', [])):
        if form_type != '10-K':  # Only consider 10-K filings
            continue
        year = int(recent_filings['filingDate'][i][:4])  # Leading "YYYY" of the filing date
        if not start_year <= year <= end_year:  # Outside the requested window
            continue
        accession_number = recent_filings['accessionNumber'][i].replace('-', '')  # Archive paths use the dash-less form
        primary_document = recent_filings['primaryDocument'][i]  # Primary document file name
        cik = metadata['cik']
        filing_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{primary_document}"
        urls.append((filing_url, year))  # Append the URL and year
    return urls


def fetch_filing_html(url):
    """
    Fetch the HTML content of a filing from the provided URL.
    Args:
        url (str): URL of the filing.
    Returns:
        bytes: HTML content of the filing.
    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the request
            exceeds the timeout.
    """
    headers = {'User-Agent': "getbagsfinance@gmail.com"}  # Required user-agent header for SEC API
    # A timeout keeps a stalled download from blocking the whole run
    response = requests.get(url, headers=headers, timeout=30)
    # Without this check an error page would be returned and parsed as if it
    # were the filing, and the caller would only see "section not found".
    response.raise_for_status()
    return response.content  # Return the raw (undecoded) HTML bytes


def extract_item_1a_section(html_content):
    """
    Extract the ITEM 1A section from the HTML content using BeautifulSoup.

    Every text node that mentions "Item 1A ... Risk Factors" is a possible
    start of the section. Nodes that *begin* with that header are preferred
    over nodes that merely mention it mid-sentence (cross-references). For
    each start, the text nodes that follow are collected until a node begins
    with another "Item N" heading or with "PART II". The longest collected
    text is returned, so a short table-of-contents hit does not win over the
    real section body.

    Args:
        html_content (bytes): HTML content of the filing.
    Returns:
        str: Extracted text of the ITEM 1A section (possibly empty), or None
             if no ITEM 1A header was found.
    """
    header_pattern = re.compile(r'ITEM\s*1A[^A-Za-z]*Risk Factors', re.IGNORECASE)
    # Used with .match(), so it only fires when a text node STARTS with a
    # heading; an inline "see Item 7 of this report" no longer ends the
    # section. The optional letter and the word boundary make "Item 1B." and
    # "Item 2." count as headings, and the lookahead skips repeated
    # "Item 1A" nodes inside the section itself.
    stop_pattern = re.compile(r'\s*(?:ITEM\s*(?!1A\b)\d+[A-Z]?\b|PART\s*II)', re.IGNORECASE)

    soup = BeautifulSoup(html_content, 'html.parser')
    matches = soup.find_all(string=header_pattern)  # All mentions of the "ITEM 1A" header

    if not matches:
        print("ITEM 1A not found.")
        return None

    # Prefer nodes that start with the header; fall back to any mention
    headings = [node for node in matches if header_pattern.match(node.strip())]
    candidates = headings or matches

    best_section = ''
    for header in candidates:
        pieces = []
        for text_node in header.find_all_next(string=True):
            if stop_pattern.match(text_node):  # Next section or part begins here
                break
            pieces.append(text_node)
        section = ' '.join(pieces).strip()  # Join all extracted strings into a single string
        if len(section) > len(best_section):
            best_section = section

    return best_section


def search_keywords(section_text, keywords):
    """
    Search for keywords in the extracted section and return sentences containing them.

    Matching is a case-insensitive substring test. Sentences are split after
    '.', '!' or '?' followed by any run of whitespace, so sentences separated
    by a newline, tab or non-breaking space are split as well.

    Args:
        section_text (str): Text content of the ITEM 1A section.
        keywords (list): List of keywords to search for.
    Returns:
        list: List of sentences containing the keywords, in document order.
    """
    # Lowercase the keywords once instead of once per sentence
    lowered_keywords = [keyword.lower() for keyword in keywords]

    # Split after sentence-ending punctuation on any whitespace run
    sentences = re.split(r'(?<=[.!?])\s+', section_text)

    related_sentences = []
    for sentence in sentences:
        lowered_sentence = sentence.lower()
        if any(keyword in lowered_sentence for keyword in lowered_keywords):
            related_sentences.append(sentence)
    return related_sentences


if __name__ == "__main__":
    # Entities to analyse, identified by their zero-padded SEC CIK
    ciks = ['0000789019', '0000910521', '0000772406', '0001443646']

    # Terms that mark a sentence as pandemic-related (matched case-insensitively)
    covid_keywords = ["COVID-19", "pandemic", "Corona"]

    # Column order of the report; also used as the keys of every result row
    fieldnames = ["CIK", "Year", "Total Words in ITEM 1A", "Corona Related Sentences"]

    # One row per filing in which the ITEM 1A section was found
    results = []

    for cik in ciks:
        metadata = fetch_metadata(cik)

        for url, year in get_10k_urls(metadata):
            html_content = fetch_filing_html(url)
            item_1a_text = extract_item_1a_section(html_content)

            # A missing or empty section produces no row
            if not item_1a_text:
                continue

            covid_sentences = search_keywords(item_1a_text, covid_keywords)
            row_values = (
                cik,
                year,
                len(item_1a_text.split()),      # Whitespace-delimited word count
                " | ".join(covid_sentences),    # Empty string when nothing matched
            )
            results.append(dict(zip(fieldnames, row_values)))

    # Persist everything in a single CSV file
    csv_file = "covid_mentions_item1a.csv"
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print(f"Results saved to {csv_file}")

Fetched data from https://data.sec.gov/submissions/CIK0000789019.json
ITEM 1A not found.
ITEM 1A not found.
Fetched data from https://data.sec.gov/submissions/CIK0000910521.json
Fetched data from https://data.sec.gov/submissions/CIK0000772406.json
Fetched data from https://data.sec.gov/submissions/CIK0001443646.json
Results saved to covid_mentions_item1a.csv
