# Retrieval of All Content from SEC Filings to Support RAG and Summarization

## Initial Setup

Import dependencies

In [2]:
from __future__ import annotations
import json
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Literal
import calendar
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup

Define the user agent details for the SEC API

In [3]:
USER_AGENT = "AirlineFinancialDashboard.com (michael.tricanowicz@live.com)"

Define functions to create period start and end date windows to filter filing retrieval

In [4]:
# Define function to create start and end dates
def define_period_dates(year: int, period: str) -> Tuple[datetime, datetime]:
    """
    period: 'FY' or 'Q1'/'Q2'/'Q3'/'Q4'
    Returns: (start_date, end_date) as datetimes covering that period.
    """
    if period == "FY":
        start_month = 1
        end_month = 12
    else:
        end_month = int(period[-1]) * 3
        start_month = end_month - 2
    start_day = 1
    end_day = calendar.monthrange(year, end_month)[1]
    # Create start and end date variables to constrain document scraping
    start_date = datetime(year, start_month, start_day)
    end_date = datetime(year, end_month, end_day)
    return start_date, end_date

# Define function to build date windows with labels when multiple years and/or periods are needed
def build_date_windows(years: Iterable[int], periods: Iterable[str]) -> List[Tuple[str, datetime, datetime]]:
    """
    Create a list of windows with labels, e.g.:
    [('2024-Q1', start, end), ('2024-FY', start, end), ...]
    """
    windows = []
    for y in years:
        for p in periods:
            s, e = define_period_dates(y, p)
            windows.append((f"{y}-{p}", s, e))
    # Sort for stability / readability
    windows.sort(key=lambda x: (x[1], x[0]))
    return windows

Define functions to create maps of airlines of interest and their CIKs for filing retrieval

In [5]:
# Define function to fetch ticker to CIK mapping
def fetch_ticker_to_cik_map(user_agent: str) -> Dict[str, str]:
    """
    Download SEC's company_tickers.json ONCE and build {ticker -> 10-digit CIK}.

    Why:
    - Avoids repeated downloads per ticker
    - Faster, more polite, fewer rate-limit issues
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    headers = {"User-Agent": user_agent}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    ticker_to_cik = {}
    for item in data.values():
        ticker = item["ticker"].upper()
        cik10 = str(item["cik_str"]).zfill(10)
        ticker_to_cik[ticker] = cik10
    return ticker_to_cik

# Define function to build airline CIK dictionary
def build_airline_cik_dict(airlines: List[str], ticker_to_cik: Dict[str, str]) -> Dict[str, str]:
    """
    Convert a list of tickers into {ticker: cik10}. Skips unknown tickers.
    """
    out = {}
    for t in airlines:
        t_up = t.upper()
        cik = ticker_to_cik.get(t_up)
        if cik:
            out[t_up] = cik
    return out

# Fetch and define the ticker mapping dictionary
ticker_map = fetch_ticker_to_cik_map(USER_AGENT)

## Financial Performance Summarization

**Download Complete Filings (text + tables) via Submissions → Archives index.json → download docs**<br>

For each company:
- Use Submissions JSON to list filings and get the accession number + primary document name.
- For each accession, go to the Archives folder and fetch index.json to enumerate all documents in the submission.

Download:
- primaryDocument (often .htm)
- {accession}.txt (“complete submission text file”) when available
- optionally: XBRL instance files (.xml, .xsd, .cal, .lab, .pre) and other HTML exhibits

Parse the primary HTML into:
- a cleaned narrative text file for embedding
- a set of tables exported to CSV

Why this approach:
It’s the most reliable way to capture full narrative + tables (beyond just XBRL facts).

### **Helpers + SEC client Setup**

In [6]:
# -----------------------------
# Helper functions
# -----------------------------
def normalize_cik(cik: str | int) -> str:
    """
    Convert CIK to a zero-padded 10-digit string.

    Why:
    - SEC endpoints typically want 'CIK##########' where the CIK is 10 digits.
    - Users commonly provide CIKs with fewer digits or with formatting.
    """
    s = re.sub(r"[^\d]", "", str(cik))  # keep digits only
    return s.zfill(10)

def cik_archives_path(cik: str | int) -> str:
    """
    Convert CIK to the Archives path form.

    Why:
    - Archives URLs use the integer form of the CIK (no leading zeros).
      Example: CIK 0000320193 becomes '320193' in /edgar/data/320193/
    """
    return str(int(normalize_cik(cik)))

def accession_nodashes(accession: str) -> str:
    """
    Archives folder uses accession with dashes removed.

    Example:
    - '0000320193-24-000123' -> '000032019324000123'
    """
    return accession.replace("-", "").strip()

def ensure_dir(p: Path) -> None:
    """Create a directory (and parents) if it doesn't exist."""
    p.mkdir(parents=True, exist_ok=True)

def parse_yyyy_mm_dd(s: Optional[str]) -> Optional[datetime]:
    if not s:
        return None
    try:
        return datetime.strptime(s, "%Y-%m-%d")
    except ValueError:
        return None

def match_windows(
    dt: datetime,
    windows: List[Tuple[str, datetime, datetime]],
) -> List[str]:
    """
    Return labels of all windows that contain dt.
    (Usually 0 or 1, but could be >1 if you have overlapping windows.)
    """
    labels = []
    for label, start, end in windows:
        if start <= dt <= end:
            labels.append(label)
    return labels

# -----------------------------
# SEC client with:
# - descriptive User-Agent
# - polite throttling (avoid hammering SEC)
# - retry with backoff on transient errors
# -----------------------------
@dataclass
class SecClient:
    """
    Minimal SEC HTTP client.

    Why a custom client:
    - The SEC expects a descriptive User-Agent.
    - You want consistent throttling and retry logic across all requests.
    - You want one Session (connection reuse) for speed and politeness.
    """
    user_agent: str
    min_interval_sec: float = 0.15  # ~6-7 requests/sec. Safe under typical guidance.
    timeout: int = 30
    max_retries: int = 5

    # SEC endpoints we will use
    base_submissions: str = "https://data.sec.gov/submissions"
    base_xbrl: str = "https://data.sec.gov/api/xbrl"
    base_archives: str = "https://www.sec.gov/Archives"

    def __post_init__(self) -> None:
        # Use a requests.Session for connection pooling
        self.sess = requests.Session()

        # User-Agent is REQUIRED by SEC fair-access policy; include contact.
        self.sess.headers.update({
            "User-Agent": self.user_agent,
            "Accept-Encoding": "gzip, deflate",
            "Accept": "application/json,text/html,text/plain,*/*",
        })

        # Track last request time to enforce min interval between requests
        self._last_ts = 0.0

    def _throttle(self) -> None:
        """
        Enforce a minimum time gap between requests.

        Why:
        - Keeps you compliant with SEC fair access expectations.
        - Reduces 429 "Too Many Requests" errors.
        """
        dt = time.time() - self._last_ts
        if dt < self.min_interval_sec:
            time.sleep(self.min_interval_sec - dt)

    def get(self, url: str, *, stream: bool = False) -> requests.Response:
        """
        GET with throttling + retry + exponential backoff.

        Retries for:
        - 429 rate limit
        - 5xx server errors
        """
        for attempt in range(1, self.max_retries + 1):
            self._throttle()
            try:
                r = self.sess.get(url, timeout=self.timeout, stream=stream)
                self._last_ts = time.time()

                # Retry on common transient failures
                if r.status_code in (429, 500, 502, 503, 504):
                    time.sleep(min(2 ** attempt, 30))
                    continue

                r.raise_for_status()
                return r

            except requests.RequestException:
                if attempt == self.max_retries:
                    raise
                time.sleep(min(2 ** attempt, 30))

        raise RuntimeError("Unreachable: exceeded retry loop")

    # -----------------------------
    # Submissions endpoint utilities
    # -----------------------------
    def submissions(self, cik: str | int) -> Dict[str, Any]:
        """
        Retrieve the company submissions JSON:
        https://data.sec.gov/submissions/CIK##########.json

        Why:
        - This is the best "filing catalog" for a company:
          lists accession numbers, forms, dates, and primary document names.
        """
        cik10 = normalize_cik(cik)
        return self.get(f"{self.base_submissions}/CIK{cik10}.json").json()

    def list_recent_filings(
        self,
        cik: str | int,
        forms: Iterable[str],
        limit: int = 500,
        *,
        windows: List[Tuple[str, datetime, datetime]],
        date_field: Literal["filingDate", "reportDate"] = "reportDate",
        fallback_to_filing_date: bool = True,
    ) -> List[Dict[str, Any]]:
        """
        Return a list of recent filings for specific form types filtered on a date range using either reportDate (default) or filingDate.

        Why date_field matters:
        - filingDate: when SEC received/published the filing
        - reportDate: end of reporting period (often aligns to FY/Q more cleanly)

        Why:
        - Submissions JSON stores 'recent' filings in a columnar structure:
          recent['form'][i], recent['accessionNumber'][i], etc.
        - This function converts it to row dictionaries (one per filing).
        """
        data = self.submissions(cik)
        recent = data.get("filings", {}).get("recent", {})
        if not recent:
            return []

        keys = list(recent.keys())
        n = len(recent.get("accessionNumber", []))
        rows = [{k: recent[k][i] for k in keys} for i in range(n)]

        # Filter by form type
        forms_set = set(forms)
        rows = [r for r in rows if r.get("form") in forms_set]
        
        # Filter by date windows
        filtered: List[Dict[str, Any]] = []
        for r in rows:
            dt = parse_yyyy_mm_dd(r.get(date_field))
            if dt is None and fallback_to_filing_date and date_field != "filingDate":
                dt = parse_yyyy_mm_dd(r.get("filingDate"))
            if dt is None:
                continue
            labels = match_windows(dt, windows)
            if labels:
                rr = dict(r)
                rr["matched_windows"] = labels
                rr["filter_date_used"] = dt.strftime("%Y-%m-%d")
                filtered.append(rr)

        # Sort newest to oldest by chosen date field (so limit grabs most recent within range)
        filtered.sort(
            key=lambda r: (parse_yyyy_mm_dd(r.get(date_field)) or datetime.min),
            reverse=True
        )

        return filtered[:limit]
    # -----------------------------
    # Archives utilities
    # -----------------------------
    def filing_folder(self, cik: str | int, accession: str) -> str:
        """
        Base folder for an accession, e.g.:
        https://www.sec.gov/Archives/edgar/data/{cik_int}/{accession_nodashes}

        Why:
        - Every document (HTML, text, XBRL files, exhibits) is stored here.
        """
        return f"{self.base_archives}/edgar/data/{cik_archives_path(cik)}/{accession_nodashes(accession)}"

    def filing_index_json(self, cik: str | int, accession: str) -> Dict[str, Any]:
        """
        Fetch index.json listing all files in the accession folder.

        Why:
        - "Complete filing" usually means more than one file.
        - index.json is the authoritative list of docs to download.
        """
        return self.get(f"{self.filing_folder(cik, accession)}/index.json").json()

    def download_file(self, url: str, out_path: Path) -> None:
        """
        Stream-download a file to disk.

        Why stream:
        - Some filings/exhibits are large.
        - Streaming avoids holding the entire file in memory.
        """
        ensure_dir(out_path.parent)
        r = self.get(url, stream=True)
        with out_path.open("wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 256):
                if chunk:
                    f.write(chunk)

    def download_accession_package(
        self,
        cik: str | int,
        accession: str,
        primary_document: Optional[str],
        out_dir: Path,
        *,
        include_patterns: Tuple[str, ...] = (".htm", ".html", ".txt", ".xml", ".xsd", ".cal", ".lab", ".pre", ".json"),
        exclude_patterns: Tuple[str, ...] = (".jpg", ".png", ".gif", ".svg", ".pdf"),
    ) -> Dict[str, Any]:
        """
        Download a "complete package" for an accession.

        Downloads:
        - index.json
        - primary document (if provided)
        - the accession 'complete submission text file' {accession}.txt (best-effort)
        - all other files in index.json that match include_patterns and aren't excluded

        Why include XBRL files:
        - Even if you're doing RAG, keeping the raw XBRL instance files can be useful later.
        - Some tables you care about (especially in 10-Q/10-K) may be represented in XBRL too.

        Why exclude images/PDF by default:
        - They're often heavy and less useful for text-based RAG.
        - You can include them later if you decide to OCR or image-parse.
        """
        ensure_dir(out_dir)
        folder = self.filing_folder(cik, accession)

        # 1) Save index.json (document inventory)
        idx = self.filing_index_json(cik, accession)
        (out_dir / "index.json").write_text(json.dumps(idx, indent=2), encoding="utf-8")

        # 2) Download the primary doc (often the main HTML)
        if primary_document:
            self.download_file(f"{folder}/{primary_document}", out_dir / primary_document)

        # 3) Best-effort download of "complete submission text file"
        #    Not all accessions have it, but many do.
        try:
            self.download_file(f"{folder}/{accession}.txt", out_dir / f"{accession}.txt")
        except requests.HTTPError:
            pass

        # 4) Download additional files from index.json
        items = idx.get("directory", {}).get("item", [])
        for it in items:
            name = it.get("name")
            if not name:
                continue

            lname = name.lower()

            # Exclude heavy/irrelevant types unless you explicitly want them
            if exclude_patterns and any(lname.endswith(x) for x in exclude_patterns):
                continue

            # Only include relevant doc types
            if include_patterns and not any(lname.endswith(x) for x in include_patterns):
                continue

            dest = out_dir / name
            if dest.exists():
                continue  # avoid re-downloading if already present

            self.download_file(f"{folder}/{name}", dest)

        return idx

### **Convert filing HTML to “RAG artifacts” (text + tables)**

Most LLM/RAG pipelines work best when:
- narrative text is cleaned (scripts/styles removed, normalized whitespace)
- tables are stored in a consistent format (CSV/JSON), and you keep them separately from narrative text

We’ll parse the primary HTML into:
- narrative_text.txt
- tables/table_###.csv

In [7]:
def soup_from_text(text: str) -> BeautifulSoup:
    """
    Choose the correct parser automatically.

    - XML → use XML parser
    - HTML/XHTML → use HTML parser
    """
    # Heuristic: XML docs almost always start with this
    if text.lstrip().startswith("<?xml"):
        return BeautifulSoup(text, "lxml-xml")
    else:
        return BeautifulSoup(text, "lxml")


def html_to_text_and_tables(html_path: Path) -> Tuple[str, List[pd.DataFrame]]:
    """
    Convert a filing HTML into:
    - narrative-ish text (for embeddings / summarization)
    - a list of tables (as DataFrames)

    Why BeautifulSoup:
    - lets us remove scripts/styles and extract the visible text.

    Why pandas.read_html:
    - quick way to extract all <table> tags into DataFrames.
    - filings can be messy; this won't be perfect, but it's a strong baseline.
    """
    html = html_path.read_text(errors="ignore")

    # Parse the HTML document
    soup = soup_from_text(html)

    # Remove tags that add noise to extracted text
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    # Extract tables
    # If there are no tables, read_html raises ValueError, so we handle that.
    try:
        tables = pd.read_html(html)
    except ValueError:
        tables = []

    # Extract visible text with line breaks preserved
    text = soup.get_text("\n")

    # Normalize excessive line breaks
    text = re.sub(r"\n{3,}", "\n\n", text).strip()

    return text, tables

### **Driver function: download filings + prepare RAG folder structure**

**RAG downloader (complete filings)**

Folder layout example:<br>
sec_rag/<br>
&ensp;CIK0000006201/<br>
&ensp;&ensp;2025-10-23_10-Q_0000006201-25-000123/<br>
&ensp;&ensp;&ensp;index.json<br>
&ensp;&ensp;&ensp;metadata.json<br>
&ensp;&ensp;&ensp;primarydoc.htm<br>
&ensp;&ensp;&ensp;narrative_text.txt<br>
&ensp;&ensp;tables/<br>
&ensp;&ensp;&ensp;table_000.csv<br>
&ensp;&ensp;&ensp;table_001.csv<br>
&ensp;&ensp;0000006201-25-000123.txt<br>
&ensp;&ensp;...<br>

Why this structure:
- easy to re-run incrementally (new accessions get new folders)
- easy to index in a vector DB (metadata + text paths)
- easy to audit / reproduce

In [8]:
def download_filings_for_rag(
    client: SecClient,
    cik: str | int,
    out_root: Path,
    forms: Iterable[str] = ("8-K", "10-Q", "10-K", "ARS"),
    limit: int = 500,
    *,
    windows: List[Tuple[str, datetime, datetime]],
    date_field: Literal["filingDate", "reportDate"] = "reportDate",
) -> None:
    """
    Download a set of filings for a company and produce RAG-friendly artifacts.

    Steps:
    1) List recent filings via Submissions JSON.
    2) For each filing accession:
       - download the accession package (primary HTML, complete submission text, and other docs)
       - extract narrative text + tables from the primary HTML (if present)
       - save metadata.json for later indexing

    Why 'limit':
    - keeps dev/testing fast
    - you can expand later once you're confident in your pipeline
    """
    cik10 = normalize_cik(cik)
    company_dir = out_root / f"CIK{cik10}"
    ensure_dir(company_dir)

    # 1) Get recent filings of the desired forms
    filings = client.list_recent_filings(
        cik,
        forms=forms,
        limit=limit,
        windows=windows,
        date_field=date_field,
    )

    for f in filings:
        # 2) Core identifiers
        accn = f["accessionNumber"]
        form = f.get("form")
        filed = f.get("filingDate")
        report_date = f.get("reportDate")
        primary = f.get("primaryDocument")
        matched = f.get("matched_windows", [])

        # 3) Create a deterministic folder per accession and skip if already done
        acc_dir = company_dir / f"{filed}_{form}_{accn}"
        if (acc_dir / "metadata.json").exists():
            continue
        ensure_dir(acc_dir)

        # 4) Download the full accession package
        client.download_accession_package(
            cik=cik,
            accession=accn,
            primary_document=primary,
            out_dir=acc_dir,
            include_patterns=(".htm", ".html", ".txt", ".xml", ".xsd", ".cal", ".lab", ".pre", ".json"),
            exclude_patterns=(".jpg", ".png", ".gif", ".svg"),
        )

        # 5) Convert primary HTML to narrative text + table CSVs
        if primary:
            primary_path = acc_dir / primary
            if primary_path.exists() and primary_path.suffix.lower() in (".htm", ".html"):
                text, tables = html_to_text_and_tables(primary_path)

                # Save narrative text for embeddings / LLM summarization
                (acc_dir / "narrative_text.txt").write_text(text, encoding="utf-8", errors="replace")

                # Save extracted tables as CSVs
                tdir = acc_dir / "tables"
                ensure_dir(tdir)

                # Cap number of tables to avoid huge output for very table-heavy documents
                for i, df in enumerate(tables[:300]):
                    df.to_csv(tdir / f"table_{i:03d}.csv", index=False)

        # 6) Save metadata for vector DB indexing and provenance
        meta = {
            "cik": cik10,
            "accession": accn,
            "form": form,
            "filed": filed,
            "reportDate": report_date,
            "primaryDocument": primary,
            "date_field_used_for_filtering": date_field,
            "matched_windows": matched,  # <- key addition
        }
        (acc_dir / "metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

### **RAG Materials Download**

In [9]:
# Set up the SEC client using the previously defined user agent
client = SecClient(user_agent=USER_AGENT)

# Set your output root directory
out_root = Path("SEC_Filings/EDGAR_raw")

# Define the airlines(s) of interest and generate their CIK mapping
airlines = ["AAL", "DAL", "UAL", "LUV"]
airline_ciks = build_airline_cik_dict(airlines, ticker_map)

# Define the period(s) of interest and generate filter windows
years = [2025]
periods = ["FY"]
windows = build_date_windows(years, periods)

# Define date field to filter on (functions default to "reportDate" but this makes it explicit)
date_field = "reportDate"

# Run for all airlines present in data
for ticker, cik in airline_ciks.items():
    download_filings_for_rag(
        client,
        cik=cik,
        out_root=out_root,
        windows=windows,
        date_field=date_field,
        limit=1000,  # adjust as necessary
    )

  tables = pd.read_html(html)
  soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
  tables = pd.read_html(html)
  soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
  tables = pd.read_html(html)
  soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
  tables = pd.read_html(html)
  soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
