In [1]:
import os
import json
import re
from datetime import datetime
from bs4 import BeautifulSoup


In [None]:
# Lower-case regexes that must match a table row's *entire* label for the row
# to be treated as the revenue line.
# NOTE(review): "gross profit" is not a revenue figure; it is kept only so
# existing results do not change — confirm whether it belongs in this list.
_NET_SALES_LABEL_PATTERNS = (
    r"net sales",
    r"total net sales",
    r"net revenue",
    r"total net revenue",
    r"revenue",
    r"total revenues",
    r"gross profit",
    r"sales",
    r"total operating revenues",
    r"total revenues and other income",
)
_NET_SALES_LABEL_RE = re.compile(
    "|".join(f"(?:{pattern})" for pattern in _NET_SALES_LABEL_PATTERNS)
)


def _find_first_labelled_amount(soup, label_regex):
    """Return the first plain number found on a table row whose label matches.

    Tables and rows are scanned in document order. A row qualifies when the
    text of its first cell, lower-cased and whitespace-normalized, fully
    matches ``label_regex``. Within a qualifying row the remaining cells are
    tried left to right; the first one that is a bare number (after removing
    "$" and ",") wins and is returned formatted as "$1,234" (rounded to zero
    decimals). Returns None when nothing qualifies.
    """
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cells = row.find_all(["td", "th"])
            if not cells:
                continue

            # Take the raw text and collapse whitespace ourselves.
            # get_text(strip=True) strips every text node and joins them with
            # "", so "<font>Net </font><font>sales</font>" became "netsales",
            # and line breaks inside a label were never normalized.
            # str.split() also treats non-breaking spaces as whitespace.
            label = " ".join(cells[0].get_text().split()).lower()
            if not label_regex.fullmatch(label):
                continue

            # A row may hold several periods; the first numeric cell is used.
            for cell in cells[1:]:
                text = cell.get_text(strip=True).replace("$", "").replace(",", "")
                if re.fullmatch(r"\d+(\.\d+)?", text):  # Allow decimal numbers
                    return f"${float(text):,.0f}"
            # Label matched but the row had no usable number: keep scanning.
    return None


def extract_net_sales_and_fiscal_date(file_path):
    """Extract a net sales figure and the fiscal period end date from a filing.

    The file is parsed as HTML with BeautifulSoup's "lxml" parser.

    Args:
        file_path: Path to the filing text file (read as UTF-8).

    Returns:
        A ``(net_sales, fiscal_ending_date)`` tuple.
        ``net_sales`` is a string such as "$1,234" taken from the first table
        row whose label matches one of the known revenue labels, or None when
        no such row with a numeric cell exists. The number is reported as it
        appears in the table; no unit scaling is applied.
        ``fiscal_ending_date`` is the date text following "For the quarterly
        period ended" / "For the fiscal period ended" (e.g. "March 31, 2023"),
        or "Unknown" when that phrase is not found.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")

    # Collapse every whitespace run (incl. non-breaking spaces) to one space
    # so the date regex can rely on single-space separation.
    full_text = re.sub(r"\s+", " ", soup.get_text(separator=" ", strip=True))

    match = re.search(
        r"For the (quarterly|fiscal) period ended\s+([A-Za-z]+\s+\d{1,2},\s+\d{4})",
        full_text,
        re.IGNORECASE,
    )
    fiscal_ending_date = match.group(2).strip() if match else None

    net_sales = _find_first_labelled_amount(soup, _NET_SALES_LABEL_RE)

    return net_sales, fiscal_ending_date or "Unknown"

In [None]:
# Example: extract net sales and the fiscal period end date from a single 10-Q filing.
# The path is relative to the notebook's working directory.
file_path = "./full-submission.txt"
net_sales, fiscal_ending_date = extract_net_sales_and_fiscal_date(file_path)

# net_sales is None when no matching table row was found;
# fiscal_ending_date is "Unknown" when the period-ended phrase was not found.
print("Net Sales:", net_sales)
print("Fiscal Ending Date:", fiscal_ending_date)

In [None]:
def _fiscal_date_sort_key(entry):
    """Sort key for a summary entry: its fiscal end date as a datetime.

    Entries whose "fiscal_ending_date" is not in "%B %d, %Y" form (for example
    "Unknown") map to ``datetime.max`` so they sort after all dated entries.
    """
    try:
        return datetime.strptime(entry["fiscal_ending_date"], "%B %d, %Y")
    except (ValueError, TypeError):
        # ValueError: text is not a "Month D, YYYY" date; TypeError: not a str.
        return datetime.max


def process_sec_filings(base_folder):
    """Build one JSON summary of 10-Q filings per ticker under ``base_folder``.

    Expected layout: ``<base_folder>/<ticker>/10-Q/<filing_folder>/full-submission.txt``.
    Entries of ``base_folder`` without a "10-Q" sub-directory are skipped, as
    are filing folders without a "full-submission.txt".

    For each ticker, a list of ``{"filing_folder", "net_sales",
    "fiscal_ending_date"}`` records is written to
    ``<base_folder>/summary/<ticker>_10Q_summary.json``, ordered by fiscal end
    date with undated records last. A summary file is written even when the
    ticker has no usable filings (it then contains an empty list).

    Args:
        base_folder: Root directory containing one sub-directory per ticker.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    summary_folder = os.path.join(base_folder, "summary")
    os.makedirs(summary_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the log and
    # the relative order of tied (e.g. "Unknown") records reproducible.
    for ticker in sorted(os.listdir(base_folder)):
        ticker_path = os.path.join(base_folder, ticker, "10-Q")
        if not os.path.isdir(ticker_path):
            continue

        print(f"Processing {ticker}...")
        result = []
        for filing_folder in sorted(os.listdir(ticker_path)):
            txt_file = os.path.join(ticker_path, filing_folder, "full-submission.txt")
            if not os.path.exists(txt_file):
                continue

            net_sales, fiscal_date = extract_net_sales_and_fiscal_date(txt_file)
            result.append({
                "filing_folder": filing_folder,
                "net_sales": net_sales,
                "fiscal_ending_date": fiscal_date
            })

        # Chronological order; list.sort is stable, so ties keep folder order.
        result.sort(key=_fiscal_date_sort_key)

        # Save JSON summary
        out_path = os.path.join(summary_folder, f"{ticker}_10Q_summary.json")
        with open(out_path, "w", encoding="utf-8") as out_file:
            json.dump(result, out_file, indent=2)
        print(f"Saved summary to {out_path}")

In [None]:
# Generate and consolidate all 10-Q filings by ticker symbol.
# One "<ticker>_10Q_summary.json" file is written per ticker under "SEC-Filings/summary".

base_folder = "SEC-Filings"
process_sec_filings(base_folder)

Processing AAPL...
Saved summary to SEC-Filings\summary\AAPL_10Q_summary.json
Processing ABBV...
Saved summary to SEC-Filings\summary\ABBV_10Q_summary.json
Processing AMZN...
Saved summary to SEC-Filings\summary\AMZN_10Q_summary.json
Processing AVGO...
Saved summary to SEC-Filings\summary\AVGO_10Q_summary.json
Processing BAC...
Saved summary to SEC-Filings\summary\BAC_10Q_summary.json
Processing BACRP...
Saved summary to SEC-Filings\summary\BACRP_10Q_summary.json
Processing COST...
Saved summary to SEC-Filings\summary\COST_10Q_summary.json
Processing CRM...
Saved summary to SEC-Filings\summary\CRM_10Q_summary.json
Processing CVX...
Saved summary to SEC-Filings\summary\CVX_10Q_summary.json
Processing GOOG...
Saved summary to SEC-Filings\summary\GOOG_10Q_summary.json
Processing GOOGL...
Saved summary to SEC-Filings\summary\GOOGL_10Q_summary.json
Processing HD...
Saved summary to SEC-Filings\summary\HD_10Q_summary.json
Processing JNJ...
Saved summary to SEC-Filings\summary\JNJ_10Q_summary