# In [1]:
from __future__ import annotations

import json
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pandas as pd



# -----------------------------
# Utilities
# -----------------------------
def read_text_safely(path: Path) -> str:
    """
    Return the contents of ``path`` decoded as UTF-8.

    A missing path yields an empty string instead of raising. Bytes that are
    not valid UTF-8 are substituted with the replacement character
    (``errors="replace"``), so decoding never raises ``UnicodeDecodeError``.
    """
    if path.exists():
        return path.read_text(encoding="utf-8", errors="replace")
    return ""


def read_json(path: Path) -> Dict[str, Any]:
    """
    Parse the JSON document stored at ``path``.

    Returns an empty dict when the path does not exist. The file is decoded as
    UTF-8 with undecodable bytes replaced; malformed JSON still raises
    ``json.JSONDecodeError``.
    """
    if path.exists():
        raw = path.read_text(encoding="utf-8", errors="replace")
        return json.loads(raw)
    return {}


def iso_or_none(dt: Any) -> Optional[str]:
    """
    Turn a date-like value into a string, passing ``None`` through.

    - ``None`` and ``str`` values are returned unchanged (strings are not
      validated or reformatted).
    - ``datetime`` instances are rendered with ``isoformat()``.
    - Anything else falls back to ``str(dt)``.
    """
    if dt is None or isinstance(dt, str):
        return dt
    return dt.isoformat() if isinstance(dt, datetime) else str(dt)


def sanitize_filename(s: str) -> str:
    """
    Make ``s`` safe to use as a file name.

    Every run of characters other than ASCII letters, digits, ``.``, ``_`` and
    ``-`` collapses to a single underscore; leading and trailing underscores
    are then removed.
    """
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    collapsed = unsafe_run.sub("_", s)
    return collapsed.strip("_")


def chunk_text(
    text: str,
    *,
    chunk_chars: int = 3500,
    overlap_chars: int = 250,
) -> List[str]:
    """
    Split long narrative text into overlapping, fixed-size character chunks.

    Character-based chunking is simple, works for any filing text, and keeps
    each chunk to a bounded size for embeddings/LLM use.

    Args:
        text: Text to split. ``None``/empty/whitespace-only yields ``[]``.
            Surrounding whitespace is stripped before chunking.
        chunk_chars: Maximum characters per chunk; must be positive.
        overlap_chars: Characters shared between consecutive chunks. Values
            outside ``[0, chunk_chars - 1]`` are clamped into that range so
            every step makes forward progress.

    Returns:
        The chunks in order; the last chunk always ends at the end of the text.

    Raises:
        ValueError: If ``chunk_chars`` is not positive and there is text to
            chunk.
    """
    text = (text or "").strip()
    if not text:
        return []

    if chunk_chars <= 0:
        raise ValueError(f"chunk_chars must be positive, got {chunk_chars}")

    # An overlap >= chunk size would leave the start index stuck (endless
    # loop); a negative overlap would skip text between chunks. Clamp both.
    overlap = min(max(overlap_chars, 0), chunk_chars - 1)

    chunks: List[str] = []
    total = len(text)
    start = 0
    while True:
        end = min(start + chunk_chars, total)
        chunks.append(text[start:end])
        if end == total:
            break
        # end - overlap > start because overlap < chunk_chars.
        start = end - overlap
    return chunks


def load_tables_as_records(tables_dir: Path, max_tables: int = 300) -> List[Dict[str, Any]]:
    """
    Build an inventory of the ``table_*.csv`` files found in ``tables_dir``.

    Files are taken in sorted path order, capped at ``max_tables``. Each file
    produces one dict with its name, row/column counts and column labels; a
    file that cannot be read produces ``{"file": ..., "error": ...}`` instead,
    so one bad table never aborts the inventory. A missing directory yields
    an empty list.
    """
    if not tables_dir.exists():
        return []

    def describe(csv_path: Path) -> Dict[str, Any]:
        # Best-effort: any failure while reading or inspecting the table is
        # recorded on the entry rather than propagated.
        try:
            frame = pd.read_csv(csv_path)
            row_count, col_count = frame.shape
            return {
                "file": csv_path.name,
                "n_rows": int(row_count),
                "n_cols": int(col_count),
                "columns": frame.columns.tolist(),
                # To embed the full table contents, add:
                # "data": frame.fillna("").astype(str).values.tolist()
            }
        except Exception as exc:
            return {"file": csv_path.name, "error": str(exc)}

    selected = sorted(tables_dir.glob("table_*.csv"))[:max_tables]
    return [describe(csv_path) for csv_path in selected]


def collection_filename(ticker: str, period: PeriodSpec) -> str:
    """
    Name of the collection file for one ticker and period.

    The ticker is upper-cased and followed by ``period.key`` and the
    ``_filings.json`` suffix, e.g. ``AAL2022Q3_filings.json``.
    """
    symbol = ticker.upper()
    return f"{symbol}{period.key}_filings.json"


# -----------------------------
# Period Key Class
# -----------------------------

@dataclass(frozen=True)
class PeriodSpec:
    """A fiscal reporting period: a year plus a quarter or full-year marker."""

    year: int      # e.g. 2022
    period: str    # "Q1","Q2","Q3","Q4","FY"

    @property
    def key(self) -> str:
        """Compact ``YYYYPP`` form with the period upper-cased, e.g. 2022Q3, 2023FY."""
        return f"{self.year}{self.period.upper()}"

    @classmethod
    def from_label(cls, label: str) -> "PeriodSpec":
        """
        Parse a period label into a ``PeriodSpec``.

        Accepts, case-insensitively and ignoring surrounding whitespace:
          - '2022-Q3'
          - '2022Q3'
          - '2023-FY'

        The whole label must match; anything trailing (e.g. '2022-Q10', which
        a prefix match would read as Q1) is rejected.

        Raises:
            ValueError: If ``label`` is not a valid period label.
        """
        m = re.fullmatch(r"(\d{4})-?(Q[1-4]|FY)", label.strip().upper())
        if not m:
            raise ValueError(f"Invalid period label: {label}")
        return cls(year=int(m.group(1)), period=m.group(2))


# -----------------------------
# Core: build a single “collection JSON” for one airline and one period label
# -----------------------------

def accession_dirs_for_cik(out_root: Path, cik10: str) -> List[Path]:
    """
    Return the sub-directories of ``out_root / f"CIK{cik10}"``, sorted by path.

    ``Path.iterdir()`` yields entries in arbitrary order, so the result is
    sorted to keep anything derived from the traversal order reproducible
    across runs and platforms. Non-directory entries are ignored. If the
    company path is missing or is not a directory, an empty list is returned.
    """
    company_dir = out_root / f"CIK{cik10}"
    if not company_dir.is_dir():
        return []
    return sorted(entry for entry in company_dir.iterdir() if entry.is_dir())


def build_collection_json_for_airline_period(
    *,
    out_root: Path,
    cik10: str,
    ticker: str,
    period: PeriodSpec,
    out_file: Path,
    include_tables: bool = True,
    chunk_chars: int = 3500,
    overlap_chars: int = 250,
) -> Path:
    """
    Write one JSON file holding every record for a company and period.

    For each accession directory under ``out_root / f"CIK{cik10}"`` whose
    ``metadata.json`` lists the period (as ``YYYY-PP`` or ``YYYYPP``) in
    ``matched_windows``:

    - ``narrative_text.txt`` is split with ``chunk_text`` and one record is
      emitted per chunk ("page"); a filing with no narrative still gets a
      single record with empty text.
    - when ``include_tables`` is true and the ``tables`` folder yields an
      inventory, one extra record with ``page_label`` ``"tables"`` is added.

    Filings with missing/empty metadata or a non-matching period are skipped.
    The record list is serialized to ``out_file`` (parent directories are
    created as needed) and ``out_file`` is returned.
    """
    collected: List[Dict[str, Any]] = []

    for filing_dir in accession_dirs_for_cik(out_root, cik10):
        meta = read_json(filing_dir / "metadata.json")
        if not meta:
            continue

        # Keep the filing only if its matched windows name this period, in
        # either the dashed ("2025-Q3") or compact ("2025Q3") spelling.
        windows = meta.get("matched_windows") or []
        wanted_labels = (f"{period.year}-{period.period.upper()}", period.key)
        if not any(label in windows for label in wanted_labels):
            continue

        form = meta.get("form") or "UNKNOWN"
        date_filed = iso_or_none(
            meta.get("filed") or meta.get("filingDate") or meta.get("filing_date")
        )
        report_date = iso_or_none(meta.get("reportDate"))

        # Archive URL if the metadata carries one; blank otherwise.
        source = meta.get("source") or meta.get("archives_url") or ""

        # Fall back to a synthesized title when the metadata has none.
        title = meta.get("title") or (
            f"Form {form} for {ticker} (accession {meta.get('accession','')})"
        )

        narrative = read_text_safely(filing_dir / "narrative_text.txt")
        # No narrative -> a single empty page, so the filing is still listed.
        pages = chunk_text(
            narrative, chunk_chars=chunk_chars, overlap_chars=overlap_chars
        ) or [""]
        total_pages = len(pages)

        def make_record(
            id_suffix: str,
            record_title: str,
            page_label: str,
            payload_key: str,
            payload: Any,
        ) -> Dict[str, Any]:
            # Key order is part of the output format; the id embeds the
            # running record count at the moment of creation.
            return {
                "id": f"{ticker}{period.key}-{len(collected)}-{title}-{id_suffix}",
                "airline": ticker,
                "title": record_title,
                "form": form,
                "date_filed": date_filed,
                "report_date": report_date,
                "page_label": page_label,
                "total_pages": total_pages,
                "source": source,
                payload_key: payload,
            }

        for page_no, page_text in enumerate(pages, start=1):
            collected.append(
                make_record(
                    f"Page {page_no} of {total_pages}",
                    title,
                    str(page_no),
                    "text",
                    page_text,
                )
            )

        # Optional table inventory, one extra record per filing.
        if include_tables:
            table_info = load_tables_as_records(filing_dir / "tables")
            if table_info:
                collected.append(
                    make_record(
                        "Tables", f"{title} (Tables)", "tables", "tables", table_info
                    )
                )

    out_file.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(collected, ensure_ascii=False, indent=2)
    out_file.write_text(serialized, encoding="utf-8")
    return out_file


# In [3]:
# Ticker -> CIK (10-character, zero-padded string). Comment entries in or out
# to choose which airlines this run processes.
airline_ciks = {
    # "AAL": "0000006201",
    "DAL": "0000027904",
    # "UAL": "0000100517",
    # "LUV": "0000092380",
}

# Full backfill range: Q1-Q4 and FY for each year 2014 through 2025.
periods_initial = [
    PeriodSpec(year, label)
    for year in range(2014, 2026)
    for label in ("Q1", "Q2", "Q3", "Q4", "FY")
]

# Incremental run: only the current period.
periods_current = [PeriodSpec(2026, "Q1")]

# Select which period list this run uses.
periods = periods_current

for ticker, cik10 in airline_ciks.items():
    for p in periods:
        out_file = Path("SEC_Filings/EDGAR") / collection_filename(ticker, p)

        # Round-trip through from_label so the period is validated and
        # normalized before the collection is built.
        build_collection_json_for_airline_period(
            out_root=Path("SEC_Filings/EDGAR_raw"),
            cik10=cik10,
            ticker=ticker,
            period=PeriodSpec.from_label(f"{p.year}-{p.period}"),
            out_file=out_file,
            include_tables=True,
        )
