In [2]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start, # Changed from match.start() for consistency with line-based detection
                'end_pos': line_end,     # Changed from match.end()
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # A reasonable number of sections to consider it successful
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part # Get part from DocumentSection, e.g., "PART I", "PART II"

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            # Use part_number from DocumentSection to decide which 10Q map to use
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                # Fallback if part not clearly identified, try both maps
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        # If it's a PART section, check if it also includes an item title, as some PARTs have "PART I. FINANCIAL INFORMATION"
        if "Item" in section.title and section.item_number:
            # This handles cases like "PART I - Item 1. Financial Statements" if detect_sections_universal_sec captures it that way
            return f"{part_number} - {section.title.replace(part_number, '').strip(' -.')}"
        return part_number # Just return "PART I", "PART II" etc.

    # Fallback for named_section, content, or document type sections
    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using raw strings `r` and explicitly handling whitespace `\s*` and literal characters.
    # Compiling patterns once for efficiency.
    patterns = [
        # Table-based ITEM patterns with variable whitespace and optional period after item number
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns with variable whitespace
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (strong indicators, start of line)
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        # Standalone ITEM patterns (pipe-separated but not necessarily table-wrapped)
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (strong indicators, start of line)
        re.compile(r'^\s*PART\s*([IVX]+)\s*([^\n]*)', re.I | re.M),
        # Standalone PART patterns (pipe-separated)
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-dot format (e.g., "1. Business" not necessarily preceded by "Item")
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        # Number-only pattern in tables (e.g. "[TABLE_START] 1. | Business")
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Generic Section Titles that often appear as headers
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            # Determine content boundaries for the "line" containing the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., "1.")
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            # Extract section identifier and title more carefully
            section_id = None
            section_title = full_line # Default to full line

            groups = match.groups()
            if len(groups) > 0:
                potential_id = groups[0].strip()
                # Check if it's an Item/Part ID based on common patterns (e.g., "1", "1A", "I", "II")
                if re.match(r'^\d+[A-C]?$', potential_id, re.I) or re.match(r'^[IVX]+$', potential_id, re.I):
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        section_title = re.sub(re.escape('[TABLE_END]') + r'.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    elif 'Item' in full_line or 'PART' in full_line:
                        # Extract title after "Item X." or "PART X"
                        clean_line = re.sub(r'^\s*(Item|PART)\s*\d*[A-C]*[IVX]*\.?\s*[-–—]?\s*', '', full_line, flags=re.I).strip()
                        if clean_line and len(clean_line) < 200: # Ensure extracted title isn't too long
                            section_title = clean_line
                        else: # Fallback if clean_line is too long or empty
                             section_title = full_line # Still use full line as title if too complex
                else: # If the first group was not an ID, treat as generic title
                    section_title = full_line
                    # Attempt to extract ID if it's a known named section (e.g., "BUSINESS")
                    if 'BUSINESS' in full_line.upper(): section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper(): section_id = '1A'
                    # Add other named section mappings if needed. These will typically be caught by the direct regex anyway.

            # Store the original start/end of the line for correct content extraction
            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown', # Default to 'unknown' if no ID found
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start() # Keep for internal sorting preference
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or 'content'
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If same ID (e.g., multiple "Item 1" mentions), keep the earliest one unless a stronger pattern comes up
                elif current_match['section_id'] == last_added_match['section_id'] and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match # Replace with higher priority pattern
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # End position is the start of the next matched section, or end of content if it's the last one
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title to be just the part if it's a generic capture
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id:
                title = part
            elif not title:
                title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item
            # Refine title to be just the item if it's a generic capture
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id:
                title = f"Item {item_number}"
            elif not title:
                title = f"Item {item_number}"
        # For named_section, title is already the full_line or specific keyword match
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns. Using re.escape for literal parts.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # CORRECTED: Relaxed whitespace and optional period for item numbers.
    # Also made "Item" and "PART" literal words, not regex metacharacters.
    item_patterns = [
        # Example: "Item 1. Financial Statements | 3"
        re.compile(r'(?i)Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)\s*\|\s*\d+', re.DOTALL),
        # Example: "PART I | FINANCIAL INFORMATION"
        re.compile(r'(?i)PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        # Example: "Item 1A. Risk Factors" (not in table, without page number)
        re.compile(r'(?i)Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n|]+)', re.M),
        # Example: "1. | Financial Statements | 3" (starting with number, in table)
        re.compile(r'(?i)(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)\s*\|\s*\d+', re.DOTALL),
        # Example: "PART II" (simple part declaration)
        re.compile(r'(?i)PART\s*([IVX]+)', re.M)
    ]

    found_items = []
    # Only try to find items if TOC content was found
    if toc_content:
        for pattern in item_patterns:
            for match in pattern.finditer(toc_content):
                groups = match.groups()
                item_id = None
                item_title = ""

                if len(groups) >= 2: # Pattern captured both ID and Title
                    item_id = groups[0].strip()
                    item_title = groups[1].strip()
                elif len(groups) == 1: # Pattern only captured ID
                    item_id = groups[0].strip()
                    # Attempt to get text immediately following the ID match in the TOC line
                    line_remainder_start = match.end()
                    line_end_of_match = toc_content.find('\n', line_remainder_start)
                    if line_end_of_match == -1:
                        line_end_of_match = len(toc_content)
                    
                    potential_title_from_line = toc_content[line_remainder_start:line_end_of_match].strip()
                    if potential_title_from_line:
                        item_title = potential_title_from_line
                    else:
                        item_title = f"Section {item_id}" # Fallback generic title

                if item_id: # Ensure an ID was captured
                    item_title = re.sub(r'\s+', ' ', item_title).strip() # Normalize whitespace
                    found_items.append((item_id, item_title))

    unique_items = []
    seen = set()
    # Sort found items by their ID for more consistent processing, then by title for tie-breaking
    found_items.sort(key=lambda x: (x[0], x[1]))

    for item_id, title in found_items:
        # Create a unique key for deduplication, focusing on ID and a portion of title
        key = f"{item_id}_{title[:50]}"
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    current_part = None # Track current part for items found in TOC

    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None # Initial value

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
            part_num = current_part # Assign the last seen part to this item
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = f"PART {item_id}"
            current_part = part_num # Update the current part context
        else:
            section_type = 'content' # Treat as generic content section

        toc_sections.append(DocumentSection(
            title=title,
            content="", # Content is intentionally empty here; will be filled by main sectioning if this strategy is chosen.
            section_type=section_type,
            item_number=item_number,
            part=part_num # Store the identified part (either detected or inherited)
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # Sort toc_entries by their expected appearance in the document if they don't have start_pos
        # This is crucial for iterating and finding them correctly in the content.
        # If TOC parsing doesn't give start_pos, rely on the sequence.
        # If TOC parsing gives parts/items, sort by those.
        toc_entries_sorted = sorted(toc_entries, key=lambda x: (x.part if x.part else '', x.item_number if x.item_number else '', x.title))


        for i, toc_entry in enumerate(toc_entries_sorted): # Iterate through sorted TOC entries
            # Create flexible regex for the title/item number to find it in the main content
            pattern_parts = []
            if toc_entry.item_number:
                # Be flexible about "Item" prefix and trailing period
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part:
                # Be flexible about "PART" prefix
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")))
            
            # Use the full title as a fallback if item/part number is not explicit or title is more descriptive
            if toc_entry.title:
                # Ensure escaped title allows for flexible whitespace in the content
                pattern_parts.append(re.escape(toc_entry.title).replace('\\ ', '\\s*'))

            if not pattern_parts: # Skip if no pattern can be formed for this TOC entry
                continue

            # Combine all potential ways to match this section's header
            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            # Search from current_content_pos to ensure sequential parsing
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                # The content for this section goes until the start of the next TOC entry, or end of document
                next_start_pos = len(content)
                if i + 1 < len(toc_entries_sorted): # Check the next entry in the *sorted* list
                    next_toc_entry = toc_entries_sorted[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    if next_toc_entry.part:
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")))
                    if next_toc_entry.title:
                        next_pattern_parts.append(re.escape(next_toc_entry.title).replace('\\ ', '\\s*'))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        # Search for the next section starting *after* the current match's end
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part, # Preserve the part info derived from TOC
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")
                # If a TOC entry is not found, its content might be part of the previous section,
                # or it's a false positive in the TOC. For simplicity, we just move on.

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    # Adjust fiscal year for 10-K filings if the filing date is early in the calendar year
    # and typically refers to the previous fiscal year end.
    if form_type == '10K' and filing_date.month <= 3: # Assuming fiscal year ends typically in Dec or Jan-Mar for previous year
        fiscal_year -= 1 # Often a 10K filed in Jan-Mar of current year is for previous fiscal year

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0

            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            if not overlap_sentences and current_chunk_sentences:
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

chunks = test_single_file()

def compare_section_strategies(content: str):
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    sections_1 = detect_sections_strategy_1_improved(content)
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    sections_2 = detect_sections_strategy_2(content)
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

if chunks:
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison)

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    print(f"\nChunk Types:\n")
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    print(f"\nSection Distribution:\n")
    sections_dist = {}
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    print("Test 1: Non-existent file\n")
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("")
    print(f"  Result: {len(empty_sections)} sections\n")

    print("\nTest 3: Malformed filename\n")
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    os.unlink(temp_file)

    print("\nTest 4: Very short text\n")
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():
        print(f"  • {section}: {count} chunks\n")

    print(f"\n✅ Quality Metrics:\n")
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    print(f"\n🔍 Sample Chunks for Review:\n")
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0 # Renamed variable to avoid conflict with function parameter 'overlap_tokens'

            # Add sentences from the end until we reach overlap target
            # Ensure we don't go past the start of the chunk
            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            # If after trying to create overlap, we still don't have enough tokens for overlap
            # (e.g., first few sentences are very long), just take some minimal content.
            if not overlap_sentences and current_chunk_sentences:
                # Fallback to last sentence if no other overlap possible and current chunk exists
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            # Start new chunk with overlap + current sentence
            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            # Add sentence to current chunk
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    # Add final chunk if it has content
    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        # Clean table markers
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:  # Only add non-empty tables
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        # Changed to universal processing function
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

# Run the test
chunks = test_single_file()

def compare_section_strategies(content: str): # Changed content_sample to content to use full content
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content) # Changed content_sample to content
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content) # Changed content_sample to content
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        # Load full content for comparison, not just a sample
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison) # Clean it for consistent comparison

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1

    print(f"\nChunk Types:\n")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    # Section distribution
    sections_dist = {} # Renamed to avoid conflict with `sections` list
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1

    print(f"\nSection Distribution:\n")
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        # Changed to universal processing function
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

# Test different parameters
param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    # Test 1: Non-existent file
    print("Test 1: Non-existent file\n")
    # Changed to universal processing function
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    # Test 2: Empty file
    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("") # Changed to universal detection
    print(f"  Result: {len(empty_sections)} sections\n")

    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename\n")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    # Changed to universal processing function
    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    # Clean up
    os.unlink(temp_file)

    # Test 4: Very short text
    print("\nTest 4: Very short text\n")
    # This call is correct, as create_overlapping_chunks is a helper
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        # Changed to universal processing function
        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    # Summary statistics
    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    # Assumes 'chunks' variable from test_single_file() is available
    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks\n")

    # Quality metrics
    print(f"\n✅ Quality Metrics:\n")

    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:\n")

    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        # Find the actual chunk object to get its full text
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

# Create final summary
summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

# Test functions adapted to _fixed suffix to avoid NameErrors from notebook re-runs
# Ensure these are called after all function definitions.
print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt", # This file name is in the future based on current date
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Test universal detection
        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        # Test full pipeline
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:  • 18: $...
INFO:__main__:  • 26: $...
INFO:__main__:  • 37: $...
INFO:__main__:  • 40: $...
INFO:__main__:  • 56: $...
INFO:__main__:  • 58: $...
INFO:__main__:  • 68: $...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:

🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:  • 18: $...
INFO:__main__:  • 26: $...
INFO:__main__:  • 37: $...
INFO:__main__:  • 40: $...
INFO:__main__:  • 56: $...
INFO:__main__:  • 58: $...
INFO:__main__:  • 68: $...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__m

  Total chunks: 262

  Avg tokens: 273.5

  Overlap rate: 195/262


🧪 Testing: Medium chunks, medium overlap

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt



INFO:__main__:Created 125 chunks for AMZN_10Q_2022-04-29.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (901 chars)
INFO:__main__:Extracted 22 sections from table of contents:
INFO:__main__:  • 1: ...
INFO:__main__:  • 1: Financial Statements...
INFO:__main__:  • 1: Legal Proceedings...
INFO:__main__:  • 1A: ...
INFO:__main__:  • 1A: Risk Factors...
INFO:__main__:  • 2: ...
INFO:__main__:  • 2: Management’s Discussion and Analysis of Financial ...
INFO:__main__:  • 2: Unregistered Sales of Equity Securities and Use of...
INFO:__main__:  • 3: ...
INFO:__main__:  • 3: Consolidated Statements of Operations...
INFO:__main__:TOC analysis found 22 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AMZN_10Q_2020-05-01.txt
INFO:__main__:Created 195 chunks for AMZN_10Q_2020-05-01.txt
INFO:__main__:Attempting universal SEC sect

  2/3: AMZN_10Q_2020-05-01.txt

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 172

  • Average chunk size: 380 tokens

  • Size range: 38 - 1692 tokens

  • Overlap rate: 61.0%


📊 Chunk Distribution by Type:

  • narrative: 106 chunks (61.6%)

  • table: 66 chunks (38.4%)


📚 Section Breakdown:

  • Full Document: 172 chunks


✅ Quality Metrics:

  • Very small chunks (<50 tokens): 2 (1.2%)

  • Large chunks (>800 tokens): 3 (1.7%)

  • Unique sections identified: 1


🔍 Sample Chunks for Review:


  TABLE example (58 tokens):

    Section: Full Document

    Preview: California | 94

INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part I - [TABLE_START]...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Part 3 - Legal Proceedings...
INFO:__main__:  6: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  7: Item/Part II - [TABLE_START]...
INFO:__main__:  8: Item/Part 6 - Reserved...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Legal Proceedings...
INFO:__main__:  12: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
IN


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 21 sections:

  1. [TABLE_START]

     Type: part, Length: 13,293 chars

  2. Risk Factors

     Type: item, Length: 55,960 chars

  3. Unresolved Staff Comments

     Type: item, Length: 106 chars

  4. Properties

     Type: item, Length: 1,437 chars

  5. Legal Proceedings

     Type: item, Length: 185 chars

  6. Mine Safety Disclosures

     Type: item, Length: 113 chars

  7. [TABLE_START]

     Type: part, Length: 516 chars

  8. Reserved

     Type: item, Length: 50,497 chars

  9. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 6,524 chars

  10. Financial Statements and Supplementary Data

     Type:

INFO:__main__:Found 1 sections in AMZN_10Q_2024-11-01.txt
INFO:__main__:Created 132 chunks for AMZN_10Q_2024-11-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8 unique sections:
INFO:__main__:  1: Item/Part I - . Financial Information...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part II - . Other Information...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:



📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  max_tokens: 1548

  chunks_with_overlap: 81

  table_chunks: 50

  narrative_chunks: 82

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt


✅ Found 8 sections:

  1. . Financial Information

     Type: part, Length: 115,924 chars

  2. Management's Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 87,923 chars

  3. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 207 chars

  4. Controls and Procedures

     Type: item, Length: 1,004 chars

  5. . Other Information

     Type: part, Length: 248 chars

  6. Risk Factors

     Type: item, Length: 11,661 chars

  7. Unregistered Sales of Equity Securities and Use of Proceeds

     Type: item, Length: 2,127 chars

  8. Exhibits

     Type: item, Length: 13,918 chars



INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:  • 18: $...
INFO:__main__:  • 26: $...
INFO:__main__:  • 37: $...
INFO:__main__:  • 40: $...
INFO:__main__:  • 56: $...
INFO:__main__:  • 58: $...
INFO:__main__:  • 68: $...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:


Table-wrapped Parts: 15 matches

  1: [TABLE_START] California | 94-2404110 | (State or other jurisdiction | of incorporation or organizat...

  2: [TABLE_START] Periods | Total Number | of Shares Purchased | Average Price | Paid Per Share | Total ...

  3: [TABLE_START] September 2015 | September 2016 | September 2017 | September 2018 | September 2019 | S...

🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝

INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:  • 18: $...
INFO:__main__:  • 26: $...
INFO:__main__:  • 37: $...
INFO:__main__:  • 40: $...
INFO:__main__:  • 56: $...
INFO:__main__:  • 58: $...
INFO:__main__:  • 68: $...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__

  Total chunks: 262

  Avg tokens: 273.5

  Overlap rate: 195/262


🧪 Testing: Medium chunks, medium overlap

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt



INFO:__main__:Found 1 sections in AMZN_10Q_2022-04-29.txt
INFO:__main__:Created 125 chunks for AMZN_10Q_2022-04-29.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (901 chars)
INFO:__main__:Extracted 22 sections from table of contents:
INFO:__main__:  • 1: ...
INFO:__main__:  • 1: Financial Statements...
INFO:__main__:  • 1: Legal Proceedings...
INFO:__main__:  • 1A: ...
INFO:__main__:  • 1A: Risk Factors...
INFO:__main__:  • 2: ...
INFO:__main__:  • 2: Management’s Discussion and Analysis of Financial ...
INFO:__main__:  • 2: Unregistered Sales of Equity Securities and Use of...
INFO:__main__:  • 3: ...
INFO:__main__:  • 3: Consolidated Statements of Operations...
INFO:__main__:TOC analysis found 22 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AMZN_10Q_2020-05-01.txt
INFO:__main__:Created 195 chunks for AMZN_10Q_

  2/3: AMZN_10Q_2020-05-01.txt

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 172

  • Average chunk size: 380 tokens

  • Size range: 38 - 1692 tokens

  • Overlap rate: 61.0%


📊 Chunk Distribution by Type:

  • narrative: 106 chunks (61.6%)

  • table: 66 chunks (38.4%)


📚 Section Breakdown:

  • Full Document: 172 chunks


✅ Quality Metrics:

  • Very small chunks (<50 tokens): 2 (1.2%)

  • Large chunks (>800 tokens): 3 (1.7%)

  • Unique sections identified: 1


🔍 Sample Chunks for Review:


  TABLE example (58 tokens):

    Section: Full Document

    Preview: California | 94

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part I - [TABLE_START]...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Part 3 - Legal Proceedings...
INFO:__main__:  6: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  7: Item/Part II - [TABLE_START]...
INFO:__main__:  8: Item/Part 6 - Reserved...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Legal Proceedings...
INFO:__main__:  12: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
INFO:__main__:  13: Item/Part 9A - Controls and Procedures..


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 21 sections:

  1. [TABLE_START]

     Type: part, Length: 13,293 chars

  2. Risk Factors

     Type: item, Length: 55,960 chars

  3. Unresolved Staff Comments

     Type: item, Length: 106 chars

  4. Properties

     Type: item, Length: 1,437 chars

  5. Legal Proceedings

     Type: item, Length: 185 chars

  6. Mine Safety Disclosures

     Type: item, Length: 113 chars

  7. [TABLE_START]

     Type: part, Length: 516 chars

  8. Reserved

     Type: item, Length: 50,497 chars

  9. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 6,524 chars

  10. Financial Statements and Supplementary Data

     Type:

INFO:__main__:Found 1 sections in AMZN_10Q_2024-11-01.txt
INFO:__main__:Created 132 chunks for AMZN_10Q_2024-11-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8 unique sections:
INFO:__main__:  1: Item/Part I - . Financial Information...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part II - . Other Information...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:



📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  max_tokens: 1548

  chunks_with_overlap: 81

  table_chunks: 50

  narrative_chunks: 82

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt


✅ Found 8 sections:

  1. . Financial Information

     Type: part, Length: 115,924 chars

  2. Management's Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 87,923 chars

  3. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 207 chars

  4. Controls and Procedures

     Type: item, Length: 1,004 chars

  5. . Other Information

     Type: part, Length: 248 chars

  6. Risk Factors

     Type: item, Length: 11,661 chars

  7. Unregistered Sales of Equity Securities and Use of Proceeds

     Type: item, Length: 2,127 chars

  8. Exhibits

     Type: item, Length: 13,918 chars



In [3]:
def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part # Get part from DocumentSection, e.g., "PART I", "PART II"

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            # Use part_number from DocumentSection to decide which 10Q map to use
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                # Fallback if part not clearly identified, try both maps
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        # If it's a PART section, check if it also includes an item title, as some PARTs have "PART I. FINANCIAL INFORMATION"
        if "Item" in section.title and section.item_number:
            # This handles cases like "PART I - Item 1. Financial Statements" if detect_sections_universal_sec captures it that way
            return f"{part_number} - {section.title.replace(part_number, '').strip(' -.')}"
        return part_number # Just return "PART I", "PART II" etc.

    # Fallback for named_section, content, or document type sections
    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using raw strings `r` and explicitly handling whitespace `\s*` and literal characters.
    # Compiling patterns once for efficiency.
    patterns = [
        # Table-based ITEM patterns with variable whitespace and optional period after item number
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns with variable whitespace
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (strong indicators, start of line)
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        # Standalone ITEM patterns (pipe-separated but not necessarily table-wrapped)
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (strong indicators, start of line)
        re.compile(r'^\s*PART\s*([IVX]+)\s*([^\n]*)', re.I | re.M),
        # Standalone PART patterns (pipe-separated)
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-dot format (e.g., "1. Business" not necessarily preceded by "Item")
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        # Number-only pattern in tables (e.g. "[TABLE_START] 1. | Business")
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Generic Section Titles that often appear as headers
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            # Determine content boundaries for the "line" containing the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., "1.")
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            # Extract section identifier and title more carefully
            section_id = None
            section_title = full_line # Default to full line

            groups = match.groups()
            if len(groups) > 0:
                potential_id = groups[0].strip()
                # Check if it's an Item/Part ID based on common patterns (e.g., "1", "1A", "I", "II")
                if re.match(r'^\d+[A-C]?$', potential_id, re.I) or re.match(r'^[IVX]+$', potential_id, re.I):
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        section_title = re.sub(re.escape('[TABLE_END]') + r'.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    elif 'Item' in full_line or 'PART' in full_line:
                        # Extract title after "Item X." or "PART X"
                        clean_line = re.sub(r'^\s*(Item|PART)\s*\d*[A-C]*[IVX]*\.?\s*[-–—]?\s*', '', full_line, flags=re.I).strip()
                        if clean_line and len(clean_line) < 200: # Ensure extracted title isn't too long
                            section_title = clean_line
                        else: # Fallback if clean_line is too long or empty
                             section_title = full_line # Still use full line as title if too complex
                else: # If the first group was not an ID, treat as generic title
                    section_title = full_line
                    # Attempt to extract ID if it's a known named section (e.g., "BUSINESS")
                    if 'BUSINESS' in full_line.upper(): section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper(): section_id = '1A'
                    # Add other named section mappings if needed. These will typically be caught by the direct regex anyway.

            # Store the original start/end of the line for correct content extraction
            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown', # Default to 'unknown' if no ID found
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start() # Keep for internal sorting preference
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or 'content'
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If same ID (e.g., multiple "Item 1" mentions), keep the earliest one unless a stronger pattern comes up
                elif current_match['section_id'] == last_added_match['section_id'] and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match # Replace with higher priority pattern
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # End position is the start of the next matched section, or end of content if it's the last one
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title to be just the part if it's a generic capture
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id:
                title = part
            elif not title:
                title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item
            # Refine title to be just the item if it's a generic capture
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id:
                title = f"Item {item_number}"
            elif not title:
                title = f"Item {item_number}"
        # For named_section, title is already the full_line or specific keyword match
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns. Using re.escape for literal parts.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # CORRECTED: Relaxed whitespace and optional period for item numbers.
    # Also made "Item" and "PART" literal words, not regex metacharacters.
    item_patterns = [
        # Example: "Item 1. Financial Statements | 3"
        re.compile(r'(?i)Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)\s*\|\s*\d+', re.DOTALL),
        # Example: "PART I | FINANCIAL INFORMATION"
        re.compile(r'(?i)PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        # Example: "Item 1A. Risk Factors" (not in table, without page number)
        re.compile(r'(?i)Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n|]+)', re.M),
        # Example: "1. | Financial Statements | 3" (starting with number, in table)
        re.compile(r'(?i)(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)\s*\|\s*\d+', re.DOTALL),
        # Example: "PART II" (simple part declaration)
        re.compile(r'(?i)PART\s*([IVX]+)', re.M)
    ]

    found_items = []
    # Only try to find items if TOC content was found
    if toc_content:
        for pattern in item_patterns:
            for match in pattern.finditer(toc_content):
                groups = match.groups()
                item_id = None
                item_title = ""

                if len(groups) >= 2: # Pattern captured both ID and Title
                    item_id = groups[0].strip()
                    item_title = groups[1].strip()
                elif len(groups) == 1: # Pattern only captured ID
                    item_id = groups[0].strip()
                    # Attempt to get text immediately following the ID match in the TOC line
                    line_remainder_start = match.end()
                    line_end_of_match = toc_content.find('\n', line_remainder_start)
                    if line_end_of_match == -1:
                        line_end_of_match = len(toc_content)
                    
                    potential_title_from_line = toc_content[line_remainder_start:line_end_of_match].strip()
                    if potential_title_from_line:
                        item_title = potential_title_from_line
                    else:
                        item_title = f"Section {item_id}" # Fallback generic title

                if item_id: # Ensure an ID was captured
                    item_title = re.sub(r'\s+', ' ', item_title).strip() # Normalize whitespace
                    found_items.append((item_id, item_title))

    unique_items = []
    seen = set()
    # Sort found items by their ID for more consistent processing, then by title for tie-breaking
    found_items.sort(key=lambda x: (x[0], x[1]))

    for item_id, title in found_items:
        # Create a unique key for deduplication, focusing on ID and a portion of title
        key = f"{item_id}_{title[:50]}"
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    current_part = None # Track current part for items found in TOC

    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None # Initial value

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
            part_num = current_part # Assign the last seen part to this item
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = f"PART {item_id}"
            current_part = part_num # Update the current part context
        else:
            section_type = 'content' # Treat as generic content section

        toc_sections.append(DocumentSection(
            title=title,
            content="", # Content is intentionally empty here; will be filled by main sectioning if this strategy is chosen.
            section_type=section_type,
            item_number=item_number,
            part=part_num # Store the identified part (either detected or inherited)
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # Sort toc_entries by their expected appearance in the document if they don't have start_pos
        # This is crucial for iterating and finding them correctly in the content.
        # If TOC parsing doesn't give start_pos, rely on the sequence.
        # If TOC parsing gives parts/items, sort by those.
        toc_entries_sorted = sorted(toc_entries, key=lambda x: (x.part if x.part else '', x.item_number if x.item_number else '', x.title))


        for i, toc_entry in enumerate(toc_entries_sorted): # Iterate through sorted TOC entries
            # Create flexible regex for the title/item number to find it in the main content
            pattern_parts = []
            if toc_entry.item_number:
                # Be flexible about "Item" prefix and trailing period
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part:
                # Be flexible about "PART" prefix
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")))
            
            # Use the full title as a fallback if item/part number is not explicit or title is more descriptive
            if toc_entry.title:
                # Ensure escaped title allows for flexible whitespace in the content
                pattern_parts.append(re.escape(toc_entry.title).replace('\\ ', '\\s*'))

            if not pattern_parts: # Skip if no pattern can be formed for this TOC entry
                continue

            # Combine all potential ways to match this section's header
            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            # Search from current_content_pos to ensure sequential parsing
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                # The content for this section goes until the start of the next TOC entry, or end of document
                next_start_pos = len(content)
                if i + 1 < len(toc_entries_sorted): # Check the next entry in the *sorted* list
                    next_toc_entry = toc_entries_sorted[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    if next_toc_entry.part:
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")))
                    if next_toc_entry.title:
                        next_pattern_parts.append(re.escape(next_toc_entry.title).replace('\\ ', '\\s*'))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        # Search for the next section starting *after* the current match's end
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part, # Preserve the part info derived from TOC
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")
                # If a TOC entry is not found, its content might be part of the previous section,
                # or it's a false positive in the TOC. For simplicity, we just move on.

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    # Adjust fiscal year for 10-K filings if the filing date is early in the calendar year
    # and typically refers to the previous fiscal year end.
    if form_type == '10K' and filing_date.month <= 3: # Assuming fiscal year ends typically in Dec or Jan-Mar for previous year
        fiscal_year -= 1 # Often a 10K filed in Jan-Mar of current year is for previous fiscal year

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0

            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            if not overlap_sentences and current_chunk_sentences:
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

chunks = test_single_file()

def compare_section_strategies(content: str):
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    sections_1 = detect_sections_strategy_1_improved(content)
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    sections_2 = detect_sections_strategy_2(content)
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

if chunks:
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison)

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    print(f"\nChunk Types:\n")
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    print(f"\nSection Distribution:\n")
    sections_dist = {}
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    print("Test 1: Non-existent file\n")
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("")
    print(f"  Result: {len(empty_sections)} sections\n")

    print("\nTest 3: Malformed filename\n")
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    os.unlink(temp_file)

    print("\nTest 4: Very short text\n")
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():
        print(f"  • {section}: {count} chunks\n")

    print(f"\n✅ Quality Metrics:\n")
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    print(f"\n🔍 Sample Chunks for Review:\n")
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:  • 18: $...
INFO:__main__:  • 26: $...
INFO:__main__:  • 37: $...
INFO:__main__:  • 40: $...
INFO:__main__:  • 56: $...
INFO:__main__:  • 58: $...
INFO:__main__:  • 68: $...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:

🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:  • 18: $...
INFO:__main__:  • 26: $...
INFO:__main__:  • 37: $...
INFO:__main__:  • 40: $...
INFO:__main__:  • 56: $...
INFO:__main__:  • 58: $...
INFO:__main__:  • 68: $...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 00: $...
INFO:__main__:  • 04: $...
INFO:__main__:  • 18: $...
INFO:__main__:  • 26: $...
INFO:__main__:  •

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt

  2/3: AMZN_10Q_2020-05-01.txt



INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (901 chars)
INFO:__main__:Extracted 22 sections from table of contents:
INFO:__main__:  • 1: ...
INFO:__main__:  • 1: Financial Statements...
INFO:__main__:  • 1: Legal Proceedings...
INFO:__main__:  • 1A: ...
INFO:__main__:  • 1A: Risk Factors...
INFO:__main__:  • 2: ...
INFO:__main__:  • 2: Management’s Discussion and Analysis of Financial ...
INFO:__main__:  • 2: Unregistered Sales of Equity Securities and Use of...
INFO:__main__:  • 3: ...
INFO:__main__:  • 3: Consolidated Statements of Operations...
INFO:__main__:TOC analysis found 22 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AMZN_10Q_2020-05-01.txt
INFO:__main__:Created 195 chunks for AMZN_10Q_2020-05-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 172

  • Average chunk size: 380 tokens

  • Size range: 38 - 1692 tokens

  • Overlap rate: 61.0%


📊 Chunk Distribution by Type:

  • narrative: 106 chunks (61.6%)

  • table: 66 chunks (38.4%)


📚 Section Breakdown:

  • Full Document: 172 chunks


✅ Quality Metrics:

  • Very small chunks (<50 tokens): 2 (1.2%)

  • Large chunks (>800 tokens): 3 (1.7%)

  • Unique sections identified: 1


🔍 Sample Chunks for Review:


  TABLE example (58 tokens):

    Section: Full Document

    Preview: California | 94-2404110 | (State or other juris

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part I - [TABLE_START]...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Part 3 - Legal Proceedings...
INFO:__main__:  6: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  7: Item/Part II - [TABLE_START]...
INFO:__main__:  8: Item/Part 6 - Reserved...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Legal Proceedings...
INFO:__main__:  12: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
INFO:__main__:  13: Item/Part 9A - Controls and Procedures..


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 21 sections:

  1. [TABLE_START]

     Type: part, Length: 13,293 chars

  2. Risk Factors

     Type: item, Length: 55,960 chars

  3. Unresolved Staff Comments

     Type: item, Length: 106 chars

  4. Properties

     Type: item, Length: 1,437 chars

  5. Legal Proceedings

     Type: item, Length: 185 chars

  6. Mine Safety Disclosures

     Type: item, Length: 113 chars

  7. [TABLE_START]

     Type: part, Length: 516 chars

  8. Reserved

     Type: item, Length: 50,497 chars

  9. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 6,524 chars

  10. Financial Statements and Supplementary Data

     Type:

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (903 chars)
INFO:__main__:Extracted 22 sections from table of contents:
INFO:__main__:  • 1: ...
INFO:__main__:  • 1: Financial Statements...
INFO:__main__:  • 1: Legal Proceedings...
INFO:__main__:  • 1A: ...
INFO:__main__:  • 1A: Risk Factors...
INFO:__main__:  • 2: ...
INFO:__main__:  • 2: Management’s Discussion and Analysis of Financial ...
INFO:__main__:  • 2: Unregistered Sales of Equity Securities and Use of...
INFO:__main__:  • 3: ...
INFO:__main__:  • 3: Consolidated Statements of Operations...
INFO:__main__:TOC analysis found 22 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AMZN_10Q_2024-11-01.txt
INFO:__main__:Created 132 chunks for AMZN_10Q_2024-11-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8


📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  max_tokens: 1548

  chunks_with_overlap: 81

  table_chunks: 50

  narrative_chunks: 82

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt


✅ Found 8 sections:

  1. . Financial Information

     Type: part, Length: 115,924 chars

  2. Management's Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 87,923 chars

  3. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 207 chars

  4. Controls and Procedures

     Type: item, Length: 1,004 chars

  5. . Other Information

     Type: part, Length: 248 chars

  6. Risk Factors

     Type: item, Length: 11,661 chars

  7. Unregistered Sales of Equity Securities and Use of Proceeds

     Type: item, Length: 2,127 chars

  8. Exhibits

     Type: item, Length: 13,918 chars



In [4]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # A reasonable number of sections to consider it successful
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part # This will be like "PART I", "PART II" or None

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            # Prioritize using the part_number if available from section detection
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                # Fallback: if part is not explicitly set, try to infer from item maps
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        # If it's a PART section itself, format it. Includes cases where title might capture an item.
        if "Item" in section.title and section.item_number:
            # Example: "PART I - Item 1. Business" if detected that way
            # Strip "PART X" from the original title if it's already there to avoid duplication
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number # Just returns "PART I", "PART II" etc.

    # Fallback for named_section (e.g., "BUSINESS" without Item number), 'content', or 'document' types
    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using raw strings `r` and explicitly handling whitespace `\s*` and literal characters.
    # Compiling patterns once for efficiency.
    patterns = [
        # Table-based ITEM patterns: e.g., "[TABLE_START] Item 1. | Business..."
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns: e.g., "[TABLE_START] PART I | FINANCIAL INFORMATION..."
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (strong indicators, start of line): e.g., "Item 1. Business"
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        # Standalone ITEM patterns (pipe-separated but not necessarily table-wrapped): e.g., "Item 1. | Business"
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (strong indicators, start of line): e.g., "PART I. FINANCIAL INFORMATION"
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        # Standalone PART patterns (pipe-separated): e.g., "PART I | FINANCIAL INFORMATION"
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-dot format (e.g., "1. Business" not necessarily preceded by "Item", usually at start of line)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        # Number-only pattern in tables (e.g., "[TABLE_START] 1. | Business")
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Generic Section Titles that often appear as headers (e.g., "BUSINESS", "RISK FACTORS")
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            # Determine content boundaries for the "line" containing the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., "1.")
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or # Exclude table markers if not part of a valid section header
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line # Default to full line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                # Determine if the first captured group is a valid Item/Part ID
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        # Clean up title: remove trailing table markers like "[TABLE_END]" if they were captured
                        section_title = re.sub(re.escape('[TABLE_END]') + r'.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip() # Remove pipe characters
                    else: # No title captured explicitly by the group
                        # Try to extract title from the rest of the line after the ID match
                        clean_line = re.sub(r'^\s*(Item|PART)\s*\d*[A-C]*[IVX]*\.?\s*[-–—]?\s*', '', full_line, flags=re.I).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line # Fallback to full line if title extraction is problematic
                else: # First captured group was not a standard Item/Part ID, treat as part of title
                    section_title = full_line
                    # For generic named sections, use their fixed ID if applicable (e.g., BUSINESS -> 1)
                    if 'BUSINESS' in full_line.upper() and 'ITEM' not in full_line.upper(): section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and 'ITEM' not in full_line.upper(): section_id = '1A'

            # Store the original start/end of the line for correct content extraction
            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns early in the list)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or less specific types
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If both are specific, prefer the one matched by a higher-priority pattern (lower index)
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                # If they have the same ID but the new match offers a cleaner/more robust title
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8: # Heuristic for "cleaner"
                     final_matches[-1] = current_match
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title: remove "PART X" if it's already in the title to avoid redundancy.
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part # Fallback to just "PART X"
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item (inherited)
            # Refine title: remove "Item X" if it's already in the title
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}" # Fallback to just "Item X"
        # For named_section (e.g., "BUSINESS" when it's not explicitly an Item number)
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns
    # Using re.escape for literal brackets, and compiling patterns once.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # CORRECTED: Significant refinement here. Removed redundant escapes,
    # and ensured capture groups correctly isolate ID and title without extra junk.
    item_patterns = [
        # Standard table format: "Item X. Title | Page" or "Item X. Title" or "Item X | Title"
        re.compile(r'(?i)Item\s*(\d{1,2}[A-C]?)\.?\s*(?:\|\s*|\s*([^\n|]+?)\s*(?:\|\s*\d+)?$|([^\n|]+)$)', re.M),
        # Standard table format: "PART X | Title"
        re.compile(r'(?i)PART\s*([IVX]+)\s*\|\s*([^\n|]+)', re.M),
        # Standalone Item line (e.g., "Item 1A. Risk Factors")
        re.compile(r'(?i)^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.M),
        # Standalone Part line (e.g., "PART II. Other Information")
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.M),
        # Number-dot format (e.g., "1. Financial Statements") often seen as main heading in TOC tables
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]

    found_items = []
    if toc_content: # Only try to find items if TOC content was found
        for pattern in item_patterns:
            for match in pattern.finditer(toc_content):
                groups = match.groups()
                item_id = None
                item_title = ""

                # Extract ID (first non-None group)
                if groups[0] is not None:
                    item_id = groups[0].strip()
                else: # This case is for patterns where the ID isn't the first group, or if pattern needs adjustment
                    continue # Skip if no ID is clearly captured by the primary group

                # Extract title (find the first non-None group after the ID)
                if len(groups) > 1:
                    for g in groups[1:]:
                        if g is not None:
                            item_title = g.strip()
                            break
                
                # Further clean title to remove common TOC artifacts
                item_title = re.sub(r'\|\s*\d+', '', item_title).strip() # Remove "| PageNumber"
                item_title = re.sub(r'\s*\.\s*$', '', item_title).strip() # Remove trailing periods
                item_title = re.sub(r'\s+', ' ', item_title).strip() # Normalize whitespace

                if item_id and item_title: # Ensure both ID and a meaningful title are present
                    found_items.append((item_id, item_title))
                elif item_id and not item_title: # If ID found but no title, use generic
                    if re.match(r'^\d+[A-C]?$', item_id, re.I):
                        item_title = f"Item {item_id}"
                    elif re.match(r'^[IVX]+$', item_id, re.I):
                        item_title = f"PART {item_id}"
                    if item_title:
                        found_items.append((item_id, item_title))


    unique_items = []
    seen = set()
    # Sort found items by their ID for more consistent processing, then by title for tie-breaking
    # Use tuple for sorting to maintain order when IDs are same
    found_items.sort(key=lambda x: (x[0], x[1]))

    for item_id, title in found_items:
        # Create a unique key for deduplication, focusing on ID and a portion of title
        # Use a more robust key to distinguish similar titles or items.
        key = (item_id, title) # Use full tuple for strict uniqueness for now
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    current_part = None # Track current part for items found in TOC

    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None # Initial value

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
            part_num = current_part # Assign the last seen part to this item
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = f"PART {item_id}"
            current_part = part_num # Update the current part context
        else:
            section_type = 'content' # Treat as generic content section if no item/part found

        toc_sections.append(DocumentSection(
            title=title,
            content="", # Content is intentionally empty here; will be filled by main sectioning if this strategy is chosen.
            section_type=section_type,
            item_number=item_number,
            part=part_num # Store the identified part (either detected or inherited)
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # Sort toc_entries by their estimated location, using Item/Part numbers for robust sorting
        # This is crucial for iterating and finding them correctly in the content.
        def get_sort_key(entry):
            if entry.part and re.match(r'PART\s*([IVX]+)', entry.part):
                # Convert Roman numerals to int for sorting
                roman_map = {'I':1, 'V':5, 'X':10, 'L':50, 'C':100, 'D':500, 'M':1000}
                part_int = 0
                for char in entry.part.replace("PART ", ""):
                    part_int += roman_map.get(char, 0)
                return (part_int, entry.item_number if entry.item_number else '')
            elif entry.item_number:
                # Convert item number (e.g., "1A") to a sortable tuple (1, 'A')
                num_part = re.match(r'(\d+)([A-C]?)', entry.item_number)
                if num_part:
                    base_num = int(num_part.group(1))
                    alpha_part = num_part.group(2) if num_part.group(2) else ''
                    return (base_num, alpha_part)
                return (float('inf'), entry.title) # Fallback for malformed item_number
            return (float('inf'), entry.title) # Fallback for entries with no part/item

        toc_entries_sorted = sorted(toc_entries, key=get_sort_key)


        for i, toc_entry in enumerate(toc_entries_sorted):
            # Create flexible regex for the title/item number to find it in the main content
            pattern_parts = []
            
            # Prioritize matching by Item/Part numbers as they are more consistent
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part:
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")))
            
            # Add the full cleaned title as a fallback/alternative match
            if toc_entry.title:
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip() # Remove page numbers like "| 3"
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip() # Remove trailing periods
                pattern_parts.append(re.escape(cleaned_title_for_regex).replace('\\ ', '\\s*'))

            if not pattern_parts: # Should not happen if TOC parsing is good
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                # The content for this section goes until the start of the next TOC entry, or end of document
                next_start_pos = len(content)
                if i + 1 < len(toc_entries_sorted): # Check the next entry in the *sorted* list
                    next_toc_entry = toc_entries_sorted[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    if next_toc_entry.part:
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")))
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_pattern_parts.append(re.escape(next_cleaned_title_for_regex).replace('\\ ', '\\s*'))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end()) # Search from end of current match
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part, # Preserve the part info derived from TOC
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [5]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:Created 210 chunks for AMZN_10K_2023-02-03.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 11 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements...
INFO:__main__:  2: Item/Part unknown - Legal Proceedings...
INFO:__main__:  3: Item/Part 2 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  4: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  5: Item/Part 4 - Controls and Procedures...
INFO:__main__:  6: Item/Part 1 - Legal Proceedings...
INFO:__main__:  7: Item/Part 1A - Risk Factors...
INFO:__main__:  8: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  9: Item/Part 3 - Defaults Upon Senior Securities...
INFO:__main__:  10: Item/Part 5 - Other Information...
INFO:__main__:  11: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 11 sections.



📊 Processing Results:

  total_chunks: 210

  avg_tokens: 332.1666666666667

  min_tokens: 6

  max_tokens: 1157

  chunks_with_overlap: 119

  table_chunks: 90

  narrative_chunks: 120

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10Q_2024-11-01.txt


✅ Found 11 sections:

  1. Item 1 - FINANCIAL STATEMENTS

     Type: item, Length: 34,940 chars

  2. Legal Proceedings

     Type: named_section, Length: 32,116 chars

  3. Item 2 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 45,107 chars

  4. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 4,405 chars

  5. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 2,104 chars

  6. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 162 chars

  7. Item 1A - RISK FACTORS

     Type: item, Length: 59,433 chars

  8. Item 2 - UNREGISTERED SA

INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:  13: Item/Part 9B - Other Information...
INFO:__main__:  14: Item/Part 11 - Executive Compensation...
INFO:__main__:  15: 


📊 Comparison Results:

  Old detection: 19 sections

  Universal detection: 19 sections

  Improvement: +0 sections


📋 Old Sections:

  1. Part I

  2. Item 1A

  3. Item 1B

  4. Item 3

  5. Item 4

  6. Item 6

  7. Item 7

  8. Item 7A

  9. Item 8

  10. Notes to Consolidated Financial Statements

  11. Opinion on the Financial Statements

  12. Item 9

  13. Item 9B

  14. Item 11

  15. Item 12

  16. Item 13

  17. Item 14

  18. Part IV

  19. Item 16


📋 Universal Sections:

  1. Item 1 - BUSINESS

  2. Item 1A - RISK FACTORS

  3. Item 1B - UNRESOLVED STAFF COMMENTS

  4. Item 3 - LEGAL PROCEEDINGS

  5. Item 4 - MINE SAFETY DISCLOSURES

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

  7. Item 6 - SELECTED FINANCIAL DATA

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

  

In [6]:
def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part # This will be like "PART I", "PART II" or None

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            # Prioritize using the part_number if available from section detection
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                # Fallback if part not clearly identified, try both maps
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        # If it's a PART section itself, format it. Includes cases where title might capture an item.
        if "Item" in section.title and section.item_number:
            # Example: "PART I - Item 1. Financial Statements" if detected that way
            # Strip "PART X" from the original title if it's already there to avoid duplication
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number # Just returns "PART I", "PART II" etc.

    # Fallback for named_section (e.g., "BUSINESS" without Item number), 'content', or 'document' types
    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using raw strings `r` and explicitly handling whitespace `\s*` and literal characters.
    # Compiling patterns once for efficiency.
    patterns = [
        # Table-based ITEM patterns: e.g., "[TABLE_START] Item 1. | Business..."
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns: e.g., "[TABLE_START] PART I | FINANCIAL INFORMATION..."
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (strong indicators, start of line): e.g., "Item 1. Business"
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        # Standalone ITEM patterns (pipe-separated but not necessarily table-wrapped): e.g., "Item 1. | Business"
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (strong indicators, start of line): e.g., "PART I. FINANCIAL INFORMATION"
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        # Standalone PART patterns (pipe-separated): e.g., "PART I | FINANCIAL INFORMATION"
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-dot format (e.g., "1. Business" not necessarily preceded by "Item", usually at start of line)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        # Number-only pattern in tables (e.g., "[TABLE_START] 1. | Business")
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Generic Section Titles that often appear as headers (e.g., "BUSINESS", "RISK FACTORS")
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            # Determine content boundaries for the "line" containing the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives (e.g., content that looks like a header but isn't)
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., just "1.")
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or # Exclude table markers if not part of a valid section header
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line # Default to full line if specific extraction fails

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                # Determine if the first captured group is a valid Item/Part ID
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        # Clean up title: remove trailing table markers like "[TABLE_END]" if they were captured
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip() # Remove pipe characters
                    else: # No explicit title captured by a group
                        # Try to extract a clean title from the remainder of the line after the ID
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200: # Ensure extracted title isn't too long
                            section_title = clean_line
                        else:
                             section_title = full_line # Fallback to full line if cleaning is problematic
                else: # First captured group was not a standard Item/Part ID, treat as part of title
                    section_title = full_line
                    # For generic named sections (e.g., "BUSINESS"), assign a canonical ID if not part of an Item/Part already
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'
                    # Add other named section mappings if needed.

            # Store the original start/end of the line for correct content extraction
            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns early in the list)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or less specific types
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If both are specific, prefer the one matched by a higher-priority pattern (lower index means earlier in list)
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                # If they have the same ID but the new match offers a cleaner/more robust title
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8: # Heuristic for "cleaner"
                     final_matches[-1] = current_match
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title: remove "PART X" if it's already in the title to avoid redundancy.
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part # Fallback to just "PART X"
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item (inherited)
            # Refine title: remove "Item X" if it's already in the title
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}" # Fallback to just "Item X"
        # For named_section (e.g., "BUSINESS" when it's not explicitly an Item number)
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns. Using re.escape for literal parts.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # CORRECTED: Significant refinement here. Relaxed whitespace, optional periods,
    # and ensured capture groups correctly isolate ID and title without extra junk.
    item_patterns = [
        # Pattern 1: Page | PART/ITEM | Item_ID. | Title | Page_Num
        # e.g., "Page | PART I. FINANCIAL INFORMATION | Item 1. | Financial Statements | 3"
        re.compile(r'(?i)(?:Page\s*\|)?\s*(PART\s*[IVX]+)\s*\.?\s*([^|]+?)\s*\|\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        # Pattern 2: PART/ITEM | Title | Page_Num (Most common, like KO, simpler AMZN)
        # e.g., "Part I. Financial Information | 1 | Item 1. | Financial Statements (Unaudited) | 2" (This is tough, let's try to get clean item/title)
        # Or: "Item 1. | Financial Statements | 3" or "PART I | FINANCIAL INFORMATION | 3"
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        # Pattern 3: Simple Item/Part ID then Title, possibly on same line, no pipes to separate title
        # e.g., "Item 1A. Risk Factors" or "PART II. OTHER INFORMATION"
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        # Pattern 4: TOC lines that are just titles, potentially indented, often sub-sections
        # e.g., "Consolidated Statements of Cash Flows | 3" (these don't have Item #, but are important sections)
        re.compile(r'^\s*([A-Z][A-Za-z\s-,\']{10,})\s*(?:\|\s*\d+)?$', re.M), # Captures longer phrases as titles
        # Pattern 5: Number-dot format (e.g., "1. Business") usually at start of line
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]


    found_items = []
    current_part_id = None # To associate items without explicit part with the last seen part
    
    if toc_content: # Only try to find items if TOC content was found
        # Iterate over lines for better context and to handle multi-column TOCs more flexibly
        toc_lines = toc_content.split('\n')
        for line in toc_lines:
            line = line.strip()
            if not line:
                continue

            for pattern in item_patterns:
                match = pattern.search(line) # Search within the line
                if match:
                    groups = match.groups()
                    item_id = None
                    item_title = ""

                    # Determine item_id and item_title based on pattern groups
                    # This logic needs to be carefully adjusted per pattern.
                    if pattern == item_patterns[0]: # Page | PART/ITEM | Item_ID. | Title | Page_Num
                        # Group 1: PART ID, Group 2: PART Title, Group 3: Item ID, Group 4: Item Title
                        part_candidate = groups[0].strip() if groups[0] else None
                        part_title_candidate = groups[1].strip() if groups[1] else None
                        item_id = groups[2].strip() if groups[2] else None
                        item_title = groups[3].strip() if groups[3] else ""

                        if part_candidate: # If a PART was captured first, record it
                            current_part_id = part_candidate
                            found_items.append((part_candidate, part_title_candidate, 'part')) # Record PART entry

                        if item_id: # Then record the ITEM entry
                            found_items.append((item_id, item_title, 'item'))
                            break # Found a match for this line, move to next line
                    
                    elif pattern == item_patterns[1] or pattern == item_patterns[2] or pattern == item_patterns[4]: # Item/PART | Title | Page, or Item/PART. Title
                        # For these patterns, group 1 is ID, group 2 is title (or part of it)
                        item_id = groups[0].strip() if groups[0] else None
                        item_title = groups[1].strip() if len(groups) > 1 and groups[1] else "" # Title captured by second group

                        if item_id:
                            clean_item_title = re.sub(r'\|\s*\d+', '', item_title).strip() # Remove "| page"
                            clean_item_title = re.sub(r'\s*\.\s*$', '', clean_item_title).strip() # Remove trailing periods
                            clean_item_title = re.sub(r'\s+', ' ', clean_item_title).strip() # Normalize whitespace
                            
                            section_type = 'item' if re.match(r'^\d+[A-C]?$', item_id, re.I) else 'part'
                            found_items.append((item_id, clean_item_title, section_type))
                            
                            if section_type == 'part':
                                current_part_id = f"PART {item_id}" # Update current part for subsequent items

                            break # Found a match for this line, move to next line
                    
                    elif pattern == item_patterns[3]: # Number-dot format in TOC
                        item_id = groups[0].strip() if groups[0] else None
                        item_title = groups[1].strip() if len(groups) > 1 and groups[1] else ""
                        
                        if item_id and item_title:
                            clean_item_title = re.sub(r'\|\s*\d+', '', item_title).strip()
                            clean_item_title = re.sub(r'\s*\.\s*$', '', clean_item_title).strip()
                            clean_item_title = re.sub(r'\s+', ' ', clean_item_title).strip()
                            found_items.append((item_id, clean_item_title, 'item'))
                            break
                    
                    # For Pattern 4: Generic Section Titles (e.g., "Consolidated Statements of Cash Flows")
                    # These don't have item numbers directly in the TOC, but are important sections.
                    # We'll assign them a type 'named_section' and associate with current_part_id
                    elif pattern == item_patterns[4]: # This is the simple title match pattern (index 4)
                        item_title = groups[0].strip() # This is the entire captured title
                        if item_title and len(item_title) > 10: # Heuristic: only consider if title is reasonably long
                             clean_item_title = re.sub(r'\|\s*\d+', '', item_title).strip()
                             clean_item_title = re.sub(r'\s*\.\s*$', '', clean_item_title).strip()
                             clean_item_title = re.sub(r'\s+', ' ', clean_item_title).strip()
                             # Assign a dummy ID if no specific item/part ID is present for sorting later.
                             # Or better, keep item_id as None and let create_section_info handle it.
                             found_items.append((None, clean_item_title, 'named_section'))
                             break # Found a match for this line, move to next line

    # Refined deduplication and final DocumentSection creation
    unique_items = []
    seen_keys = set()
    
    # Process found_items to clean and add part context
    processed_items = []
    current_part_for_items = None
    for item_id, title, section_type_raw in found_items:
        if section_type_raw == 'part':
            current_part_for_items = f"PART {item_id}"
            processed_items.append((item_id, title, 'part', current_part_for_items))
        elif section_type_raw == 'item':
            processed_items.append((item_id, title, 'item', current_part_for_items))
        else: # named_section or unknown type
            processed_items.append((item_id, title, section_type_raw, current_part_for_items))

    # Sort processed items to handle nesting logic: parts first, then items within parts
    # Sort key: (is_part, part_roman_value, item_num_value, item_alpha_value, title)
    # This is a bit complex for a simple sort, so a robust sort might be iterative or based on final DocumentSection structure.
    # For now, let's rely on the deduplication for unique_items below.

    for item_id, title, section_type_raw, part_context in processed_items:
        # Create a key that is robust for deduplication
        key = (item_id, title, section_type_raw, part_context)
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=title,
                content="", # Content still empty, to be filled by main strategy
                section_type=section_type_raw,
                item_number=item_id if section_type_raw == 'item' else None,
                part=part_context if section_type_raw == 'item' else (f"PART {item_id}" if section_type_raw == 'part' else None),
                start_pos=0, # Not set from TOC
                end_pos=0    # Not set from TOC
            ))
            seen_keys.add(key)
    
    # Sort the final unique_items list. This is crucial for content mapping later.
    # A simple approach: sort by Item/Part ID, then by title.
    unique_items.sort(key=lambda x: (x.part if x.part else '', x.item_number if x.item_number else '', x.title))


    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:10]):
        logger.info(f"  • {sec.item_number if sec.item_number else sec.part if sec.part else 'NoID'}: {sec.title[:50]}...")

    return unique_items # Return DocumentSection objects directly

def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # Create flexible regex for the title/item number to find it in the main content
        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            # Prioritize matching by Item/Part numbers as they are more consistent
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part:
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")))
            
            # Add the full cleaned title as a fallback/alternative match
            if toc_entry.title:
                # Clean title for regex matching in content (remove page numbers, excess pipes, etc.)
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip() # Remove "| PageNumber"
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip() # Remove trailing periods
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex) # Replace multiple spaces with \s+ for flexible matching
                pattern_parts.append(re.escape(cleaned_title_for_regex)) # re.escape the cleaned title
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            # Combine all potential ways to match this section's header
            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    if next_toc_entry.part:
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")))
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end()) # Search from end of current match
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part, # Preserve the part info derived from TOC
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [7]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:Strategy 1 successful: Found 19 sections
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: I

🔍 Improved detection found 19 potential sections:
  1: PART I...
  2: Item 1A.    Risk Factors...
  3: Item 1B.    Unresolved Staff Comments...
  4: Item 3.    Legal Proceedings...
  5: Item 4.    Mine Safety Disclosures...
  6: Item 6.    Selected Financial Data...
  7: Item 7.    Management’s Discussion and Analysis of Financial Condition and Resul...
  8: Item 7A.    Quantitative and Qualitative Disclosures About Market Risk...
  9: Item 8.    Financial Statements and Supplementary Data...
  10: Notes to Consolidated Financial Statements...
  11: Opinion on the Financial Statements...
  12: Item 9.    Changes in and Disagreements with Accountants on Accounting and Finan...
  13: Item 9B.    Other Information...
  14: Item 11.    Executive Compensation...
  15: Item 12.    Security Ownership of Certain Beneficial Owners and Management and R...
Running universal detection...


📊 Comparison Results:

  Old detection: 19 sections

  Universal detection: 19 sections

  Improvement: +0 se

In [8]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives (e.g., content that looks like a header but isn't)
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., just "1.")
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or # Exclude table markers if not part of a valid section header
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line # Default to full line if specific extraction fails

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                # Determine if the first captured group is a valid Item/Part ID
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        # Clean up title: remove trailing table markers like "[TABLE_END]" if they were captured
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip() # Remove pipe characters
                    else: # No explicit title captured by a group
                        # Try to extract a clean title from the remainder of the line after the ID
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200: # Ensure extracted title isn't too long
                            section_title = clean_line
                        else:
                             section_title = full_line # Fallback to full line if cleaning is problematic
                else: # First captured group was not a standard Item/Part ID, treat as part of title
                    section_title = full_line
                    # For generic named sections (e.g., "BUSINESS"), assign a canonical ID if not part of an Item/Part already
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'
                    # Add other named section mappings if needed.

            # Store the original start/end of the line for correct content extraction
            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns early in the list)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or less specific types
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If both are specific, prefer the one matched by a higher-priority pattern (lower index means earlier in list)
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                # If they have the same ID but the new match offers a cleaner/more robust title
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8: # Heuristic for "cleaner"
                     final_matches[-1] = current_match
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title: remove "PART X" if it's already in the title to avoid redundancy.
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part # Fallback to just "PART X"
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item (inherited)
            # Refine title: remove "Item X" if it's already in the title
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}" # Fallback to just "Item X"
        # For named_section (e.g., "BUSINESS" when it's not explicitly an Item number)
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns. Using re.escape for literal parts.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # CORRECTED: Significant refinement here. Focused on capturing clean IDs and titles.
    # Added more specific patterns to handle the multi-column and sub-section structures.
    item_patterns = [
        # Pattern 1: Page | PART/ITEM | Item_ID. | Title | Page_Num (KO style)
        # Captures Part ID (Group 1), Part Title (Group 2), Item ID (Group 3), Item Title (Group 4)
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: PART/ITEM | Title | Page_Num (AMZN style, or simpler)
        # Captures Item/Part ID (Group 1), Title (Group 2). Catches "Item 1. | Financial Statements | 3" or "PART I. FINANCIAL INFORMATION | 3"
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part ID then Title (e.g., "Item 1A. Risk Factors" or "PART II. OTHER INFORMATION")
        # Captures Item/Part ID (Group 1), Title (Group 2)
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: TOC lines that are just titles, potentially indented, often sub-sections
        # These don't have Item/Part numbers explicitly. Captures Title (Group 1).
        # Filters by minimum length to avoid capturing noise like empty lines or short numbers.
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',-]{10,})\s*(?:\|\s*\d+)?$', re.M), # Title must start with capital letter, be at least 10 chars, allow numbers/symbols
        
        # Pattern 5: Number-dot format (e.g., "1. Business") usually at start of line
        # Captures Item ID (Group 1), Title (Group 2)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]


    found_items = []
    current_part_id_context = None # To associate items with the last seen part

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue

            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown' # Default type

                    if pattern == item_patterns[0]: # Page | PART/ITEM | Item_ID. | Title | Page_Num
                        part_id_cand = match.group(1) if match.group(1) else None
                        item_id = match.group(3).strip() if match.group(3) else None # Item ID is group 3
                        item_title = match.group(4).strip() if match.group(4) else "" # Item Title is group 4
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand}"
                            found_items.append((part_id_cand, match.group(2).strip(), 'part', current_part_id_context)) # Add the PART entry
                        
                        if item_id:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break # Move to next line

                    elif pattern == item_patterns[1] or pattern == item_patterns[2] or pattern == item_patterns[4]: # Patterns with ID as group 1, Title as group 2
                        item_id = match.group(1).strip()
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break

                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4 from above)
                        item_title = match.group(1).strip()
                        if item_title and len(item_title) > 10 and not re.match(r'^\d', item_title): # Ensure not just a number/symbol
                             # Assign a None ID, it's a named sub-section
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break # Move to next line
            
    # Refined deduplication and final DocumentSection creation
    unique_items = []
    seen_keys = set()
    
    # Process found_items to clean and add part context
    processed_items_with_parts = []
    
    # Re-apply current_part_id_context correctly after initial parsing (cleaner way)
    temp_sections = []
    temp_current_part = None
    for item_id, title_raw, section_type_raw, _ in found_items:
        if section_type_raw == 'part':
            temp_current_part = f"PART {item_id}"
            temp_sections.append({'item_id': item_id, 'title': title_raw, 'type': 'part', 'part': temp_current_part})
        elif section_type_raw == 'item':
            temp_sections.append({'item_id': item_id, 'title': title_raw, 'type': 'item', 'part': temp_current_part})
        else: # named_section or unknown type from TOC
            temp_sections.append({'item_id': item_id, 'title': title_raw, 'type': 'named_section', 'part': temp_current_part})


    # Deduplicate and create final DocumentSection objects.
    # Sort by parts and items for logical ordering.
    temp_sections.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in temp_sections:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="",
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:10]):
        logger.info(f"  • {sec.item_number if sec.item_number else sec.part if sec.part else 'NoID'}: {sec.title[:50]}...")

    return unique_items # Return DocumentSection objects directly

def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # Sort TOC entries to ensure correct order for content extraction
        # This sorting is already done in detect_sections_from_toc_universal before returning.
        # So toc_entries should already be sorted.

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            # Create highly flexible regex for matching TOC entry in main content
            # Account for variations in whitespace, periods, and potential parenthetical additions
            
            # Prioritize matching by Item/Part numbers if they exist
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?\s*(?:[A-Z][a-z0-9\s,\'()-]*)*') # "Item 1." or "Item 1A" with potential title after it
            elif toc_entry.part:
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?(?:\s*[-–—]?\s*[A-Z][a-z0-9\s,\'()-]*)*') # "PART I." or "PART I - TITLE"
            
            # Fallback to matching the cleaned title from TOC
            if toc_entry.title:
                # Clean title for regex matching in content (remove page numbers, excess pipes, etc.)
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip() # Remove "| PageNumber"
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip() # Remove trailing periods
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex) # Replace multiple spaces with \s+ for flexible matching
                pattern_parts.append(re.escape(cleaned_title_for_regex)) # re.escape the cleaned title
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            # Combine all potential ways to match this section's header
            # Make it look for these patterns at the beginning of a line, allowing some leading whitespace
            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries): # Check the next entry in the *sorted* list
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part:
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end()) # Search from end of current match
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [9]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:Created 132 chunks for AMZN_10Q_2024-11-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements (Unaudited)...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part 1 - Legal Proceedings...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (5004 chars)
I


📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  max_tokens: 1548

  chunks_with_overlap: 81

  table_chunks: 50

  narrative_chunks: 82

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt


✅ Found 8 sections:

  1. Item 1 - FINANCIAL STATEMENTS (UNAUDITED)

     Type: item, Length: 115,893 chars

  2. Item 2 - MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 87,923 chars

  3. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 207 chars

  4. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 1,032 chars

  5. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 220 chars

  6. Item 1A - RISK FACTORS

     Type: item, Length: 11,661 chars

  7. Item 2 - UNREGISTERED SALES OF EQUITY SECURITIES AND USE OF PROCEEDS

     Type: item, Leng

In [11]:
def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    Ensures content for each DocumentSection is correctly sliced.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using raw strings `r` and explicitly handling whitespace `\s*` and literal characters.
    # Compiling patterns once for efficiency.
    patterns = [
        # Table-based ITEM patterns: e.g., "[TABLE_START] Item 1. | Business..."
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns: e.g., "[TABLE_START] PART I | FINANCIAL INFORMATION..."
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (strong indicators, start of line): e.g., "Item 1. Business"
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        # Standalone ITEM patterns (pipe-separated but not necessarily table-wrapped): e.g., "Item 1. | Business"
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (strong indicators, start of line): e.g., "PART I. FINANCIAL INFORMATION"
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        # Standalone PART patterns (pipe-separated): e.g., "PART I | FINANCIAL INFORMATION"
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-dot format (e.g., "1. Business" not necessarily preceded by "Item", usually at start of line)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        # Number-only pattern in tables (e.g., "[TABLE_START] 1. | Business")
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Generic Section Titles that often appear as headers (e.g., "BUSINESS", "RISK FACTORS")
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            # Determine content boundaries for the "line" containing the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives (e.g., content that looks like a header but isn't)
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., just "1.")
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or # Exclude table markers if not part of a valid section header
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line # Default to full line if specific extraction fails

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                # Determine if the first captured group is a valid Item/Part ID
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        # Clean up title: remove trailing table markers like "[TABLE_END]" if they were captured
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip() # Remove pipe characters
                    else: # No explicit title captured by a group
                        # Try to extract a clean title from the remainder of the line after the ID
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200: # Ensure extracted title isn't too long
                            section_title = clean_line
                        else:
                             section_title = full_line # Fallback to full line if cleaning is problematic
                else: # First captured group was not a standard Item/Part ID, treat as part of title
                    section_title = full_line
                    # For generic named sections (e.g., "BUSINESS"), assign a canonical ID if not part of an Item/Part already
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'
                    # Add other named section mappings if needed.

            # Store the actual start and end positions of the matched content within the document
            all_matches.append({
                'start_pos': match.start(), # Use match.start() for the *exact* start of the regex match
                'end_pos': match.end(),     # Use match.end() for the *exact* end of the regex match
                'full_line': full_line, # Keep for debugging/context
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns early in the list)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or less specific types
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If both are specific, prefer the one matched by a higher-priority pattern (lower index means earlier in list)
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                # If they have the same ID but the new match offers a cleaner/more robust title
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8: # Heuristic for "cleaner"
                     final_matches[-1] = current_match
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # The content for this section goes from its start_pos to the start_pos of the *next* matched section
        # or to the end of the entire document if it's the last section.
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        # CRITICAL FIX: Ensure section_content is correctly sliced from the original content
        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title: remove "PART X" if it's already in the title to avoid redundancy.
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part # Fallback to just "PART X"
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item (inherited from last PART)
            # Refine title: remove "Item X" if it's already in the title
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}" # Fallback to just "Item X"
        # For named_section (e.g., "BUSINESS" when it's not explicitly an Item number)
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content, # Pass the correctly sliced content
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns. Using re.escape for literal parts.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC.
    # CORRECTED: Significant refinement here. Focused on capturing clean IDs and titles.
    # Added more specific patterns to handle multi-column and sub-section structures.
    # Added stricter checks to avoid capturing noise.
    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Captures: (Optional Page Num) | PART ID | PART Title (Optional) | Item ID | Item Title (Optional) | Page Num
        # Group 1: PART ID (e.g., 'I'), Group 2: PART Title, Group 3: Item ID, Group 4: Item Title
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(?:PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title (e.g., "Item 1. | Financial Statements | 3")
        # Captures: Item/PART ID (Group 1), Title (Group 2)
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (e.g., "Item 1A. Risk Factors" or "PART II. OTHER INFORMATION")
        # Captures: Item/PART ID (Group 1), Title (Group 2)
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions. Must be long enough to avoid noise.
        # Captures: Title (Group 1). Requires a reasonable length and start with a capital letter.
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)-]{15,})\s*(?:\|\s*\d+)?$', re.M), # Min 15 chars, allow more symbols

        # Pattern 5: Simple "PART X" line (e.g., "PART I")
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        # Captures Item ID (Group 1), Title (Group 2)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]


    found_items = []
    current_part_id_context = None # To associate items with the last seen part

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            # Skip lines that are likely just TOC headers/footers or page numbers
            if any(kw in line.lower() for kw in ['page', 'item', 'part', 'description']) and len(line) < 20: # Short headers/footers
                continue
            if re.match(r'^\d+$', line.strip()): # Just a page number
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()): # Just "1" or "1A"
                continue

            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column (Page | PART/ITEM | Item_ID. | Title)
                        part_id_cand = match.group(2) if match.group(2) else None # Group 2 for PART ID
                        item_id = match.group(3).strip() if match.group(3) else None # Group 3 for Item ID
                        item_title = match.group(4).strip() if match.group(4) else "" # Group 4 for Item Title
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand}"
                            found_items.append((part_id_cand, match.group(1).strip() if match.group(1) else f"PART {part_id_cand}", 'part', current_part_id_context)) # Add the PART entry
                        
                        if item_id:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break # Move to next line (matched)

                    elif pattern == item_patterns[1] or pattern == item_patterns[2] or pattern == item_patterns[5]: # Item/PART | Title | Page, or Item/PART. Title, or number. Title
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4 from above)
                        item_title = match.group(1).strip()
                        # Add a sanity check for extracted titles (e.g., not just numbers or very short)
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()): # Not purely numeric
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5 from above)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    # Deduplicate and create final DocumentSection objects.
    unique_items = []
    seen_keys = set()
    
    # Process found_items to ensure correct part context is applied and clean titles
    final_processed_items = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        # Clean title to remove trailing page numbers or extraneous characters often seen in TOCs
        cleaned_title = re.sub(r'\s*\|\s*\d+\s*$', '', title_raw).strip() # Remove "| PageNum" from end
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip() # Remove trailing periods
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip() # Remove table end markers
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip() # Normalize internal whitespace

        # Filter out titles that are just page numbers or very short/uninformative
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+$', cleaned_title):
            continue

        # Create a unique key for deduplication. Use a combination of ID, clean title, type, and part context.
        key = (item_id, cleaned_title, section_type_raw, part_context)
        if key not in seen_keys:
            # Create a DocumentSection. content is left empty to be filled later by content mapping.
            unique_items.append(DocumentSection(
                title=cleaned_title,
                content="",
                section_type=section_type_raw,
                item_number=item_id if section_type_raw == 'item' else None,
                part=part_context if section_type_raw == 'item' else (f"PART {item_id}" if section_type_raw == 'part' else None), # Store the actual part string
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    # Sort the final unique_items list. This is crucial for content mapping later.
    # Sort by part (alphabetical, which works for Roman numerals if prefixed "PART "), then by item number (numeric), then by title.
    def sort_key_for_doc_section(doc_sec):
        part_sort_val = doc_sec.part if doc_sec.part else ''
        item_num_sort_val = ''
        if doc_sec.item_number:
            match_num_alpha = re.match(r'(\d+)([A-C]?)', doc_sec.item_number)
            if match_num_alpha:
                item_num_sort_val = (int(match_num_alpha.group(1)), match_num_alpha.group(2))
            else: # Fallback for non-standard item numbers
                item_num_sort_val = (float('inf'), doc_sec.item_number)
        return (part_sort_val, item_num_sort_val, doc_sec.title)

    unique_items.sort(key=sort_key_for_doc_section)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]): # Show more for debugging TOC
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # TOC entries are already sorted by `detect_sections_from_toc_universal`

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            # Create highly flexible regex for matching TOC entry in main content
            # Account for variations in whitespace, periods, and potential parenthetical additions
            
            # Prioritize matching by Item/Part numbers if they exist
            if toc_entry.item_number:
                # Be flexible: "Item 1.", "Item 1A", "ITEM 1" etc.
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part:
                # Be flexible: "PART I", "PART II." etc.
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            # Fallback to matching the full cleaned title from TOC
            if toc_entry.title:
                # Clean title for regex matching in content (remove page numbers, excess pipes, etc.)
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip()
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip()
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex) # Handle hyphens in titles
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex) # Replace multiple spaces with \s+
                
                # Add word boundaries (\b) to prevent partial word matches, but be careful with punctuation.
                # A balance is needed here. For now, rely on careful stripping and flexible whitespace.
                pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            # Combine all potential ways to match this section's header
            # Match at the beginning of a line, allowing leading whitespace.
            # Use non-capturing groups (?:...) where applicable.
            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries): # Check the next entry in the *sorted* list
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part:
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end()) # Search from end of current match
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3: # Only consider TOC mapping successful if it yields a good number of sections
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [12]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:Created 210 chunks for AMZN_10K_2023-02-03.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 11 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements...
INFO:__main__:  2: Item/Part unknown - Legal Proceedings...
INFO:__main__:  3: Item/Part 2 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  4: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  5: Item/Part 4 - Controls and Procedures...
INFO:__main__:  6: Item/Part 1 - Legal Proceedings...
INFO:__main__:  7: Item/Part 1A - Risk Factors...
INFO:__main__:  8: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  9: Item/Part 3 - Defaults Upon Senior Securities...
INFO:__main__:  10: Item/Part 5 - Other Information...
INFO:__main__:  11: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 11 sections.



📊 Processing Results:

  total_chunks: 210

  avg_tokens: 332.1666666666667

  min_tokens: 6

  max_tokens: 1157

  chunks_with_overlap: 119

  table_chunks: 90

  narrative_chunks: 120

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10Q_2024-11-01.txt


✅ Found 11 sections:

  1. Item 1 - FINANCIAL STATEMENTS

     Type: item, Length: 34,940 chars

  2. Legal Proceedings

     Type: named_section, Length: 32,116 chars

  3. Item 2 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 45,107 chars

  4. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 4,405 chars

  5. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 2,104 chars

  6. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 162 chars

  7. Item 1A - RISK FACTORS

     Type: item, Length: 59,433 chars

  8. Item 2 - UNREGISTERED SA

In [14]:
def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    Ensures content for each DocumentSection is correctly sliced.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using raw strings `r` and explicitly handling whitespace `\s*` and literal characters.
    # Compiling patterns once for efficiency.
    patterns = [
        # Table-based ITEM patterns: e.g., "[TABLE_START] Item 1. | Business..."
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns: e.g., "[TABLE_START] PART I | FINANCIAL INFORMATION..."
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (strong indicators, start of line): e.g., "Item 1. Business"
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        # Standalone ITEM patterns (pipe-separated but not necessarily table-wrapped): e.g., "Item 1. | Business"
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (strong indicators, start of line): e.g., "PART I. FINANCIAL INFORMATION"
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        # Standalone PART patterns (pipe-separated): e.g., "PART I | FINANCIAL INFORMATION"
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-dot format (e.g., "1. Business" not necessarily preceded by "Item", usually at start of line)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        # Number-only pattern in tables (e.g., "[TABLE_START] 1. | Business")
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Generic Section Titles that often appear as headers (e.g., "BUSINESS", "RISK FACTORS")
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            # Determine content boundaries for the "line" containing the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives (e.g., content that looks like a header but isn't)
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., just "1.")
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or # Exclude table markers if not part of a valid section header
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line # Default to full line if specific extraction fails

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                # Determine if the first captured group is a valid Item/Part ID
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        # Clean up title: remove trailing table markers like "[TABLE_END]" if they were captured
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip() # Remove pipe characters
                    else: # No explicit title captured by a group
                        # Try to extract a clean title from the remainder of the line after the ID
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200: # Ensure extracted title isn't too long
                            section_title = clean_line
                        else:
                             section_title = full_line # Fallback to full line if cleaning is problematic
                else: # First captured group was not a standard Item/Part ID, treat as part of title
                    section_title = full_line
                    # For generic named sections (e.g., "BUSINESS"), assign a canonical ID if not part of an Item/Part already
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'
                    # Add other named section mappings if needed.

            # Store the actual start and end positions of the matched content within the document
            all_matches.append({
                'start_pos': match.start(), # Use match.start() for the *exact* start of the regex match
                'end_pos': match.end(),     # Use match.end() for the *exact* end of the regex match
                'full_line': full_line, # Keep for debugging/context
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns early in the list)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or less specific types
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If both are specific, prefer the one matched by a higher-priority pattern (lower index means earlier in list)
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                # If they have the same ID but the new match offers a cleaner/more robust title
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8: # Heuristic for "cleaner"
                     final_matches[-1] = current_match
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # The content for this section goes from its start_pos to the start_pos of the *next* matched section
        # or to the end of the entire document if it's the last section.
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        # CRITICAL FIX: Ensure section_content is correctly sliced from the original content
        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title: remove "PART X" if it's already in the title to avoid redundancy.
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part # Fallback to just "PART X"
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item (inherited from last PART)
            # Refine title: remove "Item X" if it's already in the title
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}" # Fallback to just "Item X"
        # For named_section (e.g., "BUSINESS" when it's not explicitly an Item number)
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content, # Pass the correctly sliced content
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns. Using re.escape for literal parts.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC.
    # CORRECTED: Significant refinement here. Focused on capturing clean IDs and titles.
    # Added more specific patterns to handle multi-column and sub-section structures.
    # Added stricter checks to avoid capturing noise.
    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Captures: (Optional Page Num) | PART ID | PART Title (Optional) | Item ID | Item Title (Optional) | Page Num
        # Group 1: PART ID, Group 2: PART Title, Group 3: Item ID, Group 4: Item Title
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title (e.g., "Item 1. | Financial Statements | 3")
        # Captures: Item/PART ID (Group 1), Title (Group 2)
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (e.g., "Item 1A. Risk Factors" or "PART II. OTHER INFORMATION")
        # Captures: Item/PART ID (Group 1), Title (Group 2)
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions. Must be long enough to avoid noise.
        # Captures: Title (Group 1). Requires a reasonable length and start with a capital letter.
        # Made more robust to handle various characters and avoid matching short noise.
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M), # Min 15 chars, allow more symbols
        
        # Pattern 5: Simple "PART X" line (e.g., "PART I")
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        # Captures Item ID (Group 1), Title (Group 2)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]


    found_items = []
    current_part_id_context = None # To associate items with the last seen part

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            # Skip lines that are likely just TOC headers/footers or page numbers or short noisy lines
            if any(kw in line.lower() for kw in ['page', 'item', 'part', 'description']) and len(line) < 20:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()): # Just a page number
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()): # Just "1" or "1A"
                continue
            if len(line) < 5: # Very short lines are likely noise
                continue


            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column (Page | PART/ITEM | Item_ID. | Title)
                        part_id_cand = match.group(2) if match.group(2) else None # Group 2 for PART ID
                        item_id = match.group(3).strip() if match.group(3) else None # Group 3 for Item ID
                        item_title = match.group(4).strip() if match.group(4) else "" # Group 4 for Item Title
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand}"
                            # Add the PART entry, title might be from group 1 if available and clean, else generic
                            part_title_from_group = match.group(1).strip() if match.group(1) else f"PART {part_id_cand}"
                            found_items.append((part_id_cand, part_title_from_group, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break # Move to next line (matched)

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else "" # Title captured by second group

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        # Add sanity checks for extracted titles (e.g., not just numbers or very short)
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()): # Not purely numeric
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    # Deduplicate and create final DocumentSection objects.
    unique_items = []
    seen_keys = set()
    
    # Clean titles and associate with parts
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        # Clean title to remove trailing page numbers or extraneous characters often seen in TOCs
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip() # Remove "| PageNum" from end
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip() # Remove trailing periods
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip() # Remove table end markers
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip() # Normalize internal whitespace
        
        # Filter out titles that are just numbers or very short/uninformative after cleaning
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    # Sort and deduplicate
    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="", # Content still empty, to be filled by main strategy
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]): # Show more for debugging TOC
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # TOC entries are already sorted by `detect_sections_from_toc_universal`
        # and filtered for quality.

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            # Create highly flexible regex for matching TOC entry in main content
            # Account for variations in whitespace, periods, and potential parenthetical additions
            
            # Prioritize matching by Item/Part numbers if they exist
            if toc_entry.item_number:
                # Be flexible: "Item 1.", "Item 1A", "ITEM 1" etc.
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "): # Ensure it's a valid PART string
                # Be flexible: "PART I", "PART II." etc.
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            # Fallback to matching the full cleaned title from TOC
            if toc_entry.title:
                # Clean title for regex matching in content (remove page numbers, excess pipes, etc.)
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip() # Remove "| PageNumber"
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip() # Remove trailing periods
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex) # Handle hyphens in titles
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex) # Replace multiple spaces with \s+
                
                # Add word boundaries (\b) only if title is not too short, to prevent partial word matches
                if len(cleaned_title_for_regex) > 5: # Heuristic: add \b for longer titles
                    pattern_parts.append(r'\b' + re.escape(cleaned_title_for_regex) + r'\b')
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            # Combine all potential ways to match this section's header
            # Match at the beginning of a line, allowing leading whitespace.
            # Use non-capturing groups (?:...) where applicable.
            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries): # Check the next entry in the *sorted* list
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b' + re.escape(next_cleaned_title_for_regex) + r'\b')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end()) # Search from end of current match
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3: # Only consider TOC mapping successful if it yields a good number of sections
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [15]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:  3: Item/Part 2 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  4: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  5: Item/Part 4 - Controls and Procedures...
INFO:__main__:  6: Item/Part 1 - Legal Proceedings...
INFO:__main__:  7: Item/Part 1A - Risk Factors...
INFO:__main__:  8: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  9: Item/Part 3 - Defaults Upon Senior Securities...
INFO:__main__:  10: Item/Part 5 - Other Information...
INFO:__main__:  11: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 11 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (903 chars)
INFO:__main__:Extracted 1 sections from table of contents:
INFO:__main__:  • ID: PART I, Type: part, Title: PART I. FINANCIAL INF


✅ Found 11 sections:

  1. Item 1 - FINANCIAL STATEMENTS

     Type: item, Length: 34,940 chars

  2. Legal Proceedings

     Type: named_section, Length: 32,116 chars

  3. Item 2 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 45,107 chars

  4. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 4,405 chars

  5. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 2,104 chars

  6. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 162 chars

  7. Item 1A - RISK FACTORS

     Type: item, Length: 59,433 chars

  8. Item 2 - UNREGISTERED SALES OF EQUITY SECURITIES AND USE OF PROCEEDS

     Type: item, Length: 103 chars

  9. Item 3 - DEFAULTS UPON SENIOR SECURITIES

     Type: item, Length: 153 chars

  10. Item 5 - OTHER INFORMATION

     Type: item, Length: 3,031 chars


📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  

In [None]:
def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    Ensures content for each DocumentSection is correctly sliced.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using raw strings `r` and explicitly handling whitespace `\s*` and literal characters.
    # Compiling patterns once for efficiency.
    patterns = [
        # Table-based ITEM patterns: e.g., "[TABLE_START] Item 1. | Business..."
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns: e.g., "[TABLE_START] PART I | FINANCIAL INFORMATION..."
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (strong indicators, start of line): e.g., "Item 1. Business"
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        # Standalone ITEM patterns (pipe-separated but not necessarily table-wrapped): e.g., "Item 1. | Business"
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (strong indicators, start of line): e.g., "PART I. FINANCIAL INFORMATION"
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        # Standalone PART patterns (pipe-separated): e.g., "PART I | FINANCIAL INFORMATION"
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-dot format (e.g., "1. Business" not necessarily preceded by "Item", usually at start of line)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        # Number-only pattern in tables (e.g., "[TABLE_START] 1. | Business")
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Generic Section Titles that often appear as headers (e.g., "BUSINESS", "RISK FACTORS")
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            # Determine content boundaries for the "line" containing the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives (e.g., content that looks like a header but isn't)
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short (e.g., just "1.")
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or # Exclude table markers if not part of a valid section header
                full_line.count(' ') > 20):  # Too many words, likely not a header
                continue

            # Heuristic to filter out TOC entries that might match general patterns
            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line # Default to full line if specific extraction fails

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                # Determine if the first captured group is a valid Item/Part ID
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]: # If a title group was also captured
                        section_title = groups[1].strip()
                        # Clean up title: remove trailing table markers like "[TABLE_END]" if they were captured
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip() # Remove pipe characters
                    else: # No explicit title captured by a group
                        # Try to extract a clean title from the remainder of the line after the ID
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200: # Ensure extracted title isn't too long
                            section_title = clean_line
                        else:
                             section_title = full_line # Fallback to full line if cleaning is problematic
                else: # First captured group was not a standard Item/Part ID, treat as part of title
                    section_title = full_line
                    # For generic named sections (e.g., "BUSINESS"), assign a canonical ID if not part of an Item/Part already
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'
                    # Add other named section mappings if needed.

            # Store the actual start and end positions of the matched content within the document
            all_matches.append({
                'start_pos': match.start(), # Use match.start() for the *exact* start of the regex match
                'end_pos': match.end(),     # Use match.end() for the *exact* end of the regex match
                'full_line': full_line, # Keep for debugging/context
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches primarily by start_pos, secondarily by pattern_idx (to prefer more specific patterns early in the list)
    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    # Filter duplicate/overlapping matches. Prioritize more specific patterns (lower pattern_idx).
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # If current match starts very close to the last added match,
            # consider if it's a duplicate or a better alternative.
            if current_match['start_pos'] - last_added_match['start_pos'] < 100: # Within 100 chars
                # Prefer matches with a specific Item/Part ID over 'unknown' or less specific types
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                # If both are specific, prefer the one matched by a higher-priority pattern (lower index means earlier in list)
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                # If they have the same ID but the new match offers a cleaner/more robust title
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8: # Heuristic for "cleaner"
                     final_matches[-1] = current_match
                # Otherwise, if it's too close and not a better candidate, skip as duplicate
            else:
                final_matches.append(current_match) # Add if sufficiently far apart

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # The content for this section goes from its start_pos to the start_pos of the *next* matched section
        # or to the end of the entire document if it's the last section.
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        # CRITICAL FIX: Ensure section_content is correctly sliced from the original content
        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part for subsequent items
            # Refine title: remove "PART X" if it's already in the title to avoid redundancy.
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part # Fallback to just "PART X"
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Assign current part context to this item (inherited from last PART)
            # Refine title: remove "Item X" if it's already in the title
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}" # Fallback to just "Item X"
        # For named_section (e.g., "BUSINESS" when it's not explicitly an Item number)
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content, # Pass the correctly sliced content
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info (either detected directly or inherited)
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns. Using re.escape for literal parts.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC.
    # CORRECTED: Significant refinement here. Focused on capturing clean IDs and titles.
    # Added more specific patterns to handle multi-column and sub-section structures.
    # Added stricter checks to avoid capturing noise.
    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Captures: (Optional Page Num) | PART ID | PART Title (Optional) | Item ID | Item Title (Optional) | Page Num
        # Group 1: PART ID, Group 2: PART Title, Group 3: Item ID, Group 4: Item Title
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title (e.g., "Item 1. | Financial Statements | 3")
        # Captures: Item/PART ID (Group 1), Title (Group 2)
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (e.g., "Item 1A. Risk Factors" or "PART II. OTHER INFORMATION")
        # Captures: Item/PART ID (Group 1), Title (Group 2)
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions. Must be long enough to avoid noise.
        # Captures: Title (Group 1). Requires a reasonable length and start with a capital letter.
        # Made more robust to handle various characters and avoid matching short noise.
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M), # Min 15 chars, allow more symbols
        
        # Pattern 5: Simple "PART X" line (e.g., "PART I")
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        # Captures Item ID (Group 1), Title (Group 2)
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]


    found_items = []
    current_part_id_context = None # To associate items with the last seen part

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            # Skip lines that are likely just TOC headers/footers or page numbers or short noisy lines
            if any(kw in line.lower() for kw in ['page', 'item', 'part', 'description']) and len(line) < 20:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()): # Just a page number
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()): # Just "1" or "1A"
                continue
            if len(line) < 5: # Very short lines are likely noise
                continue


            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column (Page | PART/ITEM | Item_ID. | Title)
                        part_id_cand = match.group(2) if match.group(2) else None # Group 2 for PART ID
                        item_id = match.group(3).strip() if match.group(3) else None # Group 3 for Item ID
                        item_title = match.group(4).strip() if match.group(4) else "" # Group 4 for Item Title
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand}"
                            # Add the PART entry, title might be from group 1 if available and clean, else generic
                            part_title_from_group = match.group(1).strip() if match.group(1) else f"PART {part_id_cand}"
                            found_items.append((part_id_cand, part_title_from_group, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break # Move to next line (matched)

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        # Add sanity checks for extracted titles (e.g., not just numbers or very short)
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()): # Not purely numeric
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    # Deduplicate and create final DocumentSection objects.
    unique_items = []
    seen_keys = set()
    
    # Process found_items to ensure correct part context is applied and clean titles
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        # Clean title to remove trailing page numbers or extraneous characters often seen in TOCs
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip() # Remove "| PageNum" from end
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip() # Remove trailing periods
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip() # Remove table end markers
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip() # Normalize internal whitespace
        
        # Filter out titles that are just numbers or very short/uninformative after cleaning
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    # Sort and deduplicate
    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="", # Content still empty, to be filled by main strategy
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]): # Show more for debugging TOC
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # TOC entries are already sorted by `detect_sections_from_toc_universal`
        # and filtered for quality.

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            # Create highly flexible regex for matching TOC entry in main content
            # Account for variations in whitespace, periods, and potential parenthetical additions
            
            # Prioritize matching by Item/Part numbers if they exist
            if toc_entry.item_number:
                # Be flexible: "Item 1.", "Item 1A", "ITEM 1" etc.
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "): # Ensure it's a valid PART string
                # Be flexible: "PART I", "PART II." etc.
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            # Fallback to matching the full cleaned title from TOC
            if toc_entry.title:
                # Clean title for regex matching in content (remove page numbers, excess pipes, etc.)
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip() # Remove "| PageNumber"
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip() # Remove trailing periods
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex) # Handle hyphens in titles
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex) # Replace multiple spaces with \s+
                
                # Add word boundaries (\b) only if title is not too short, to prevent partial word matches
                # Made word boundaries optional for more flexibility as section titles may not strictly start/end on word boundaries
                if len(cleaned_title_for_regex) > 5: # Heuristic: add \b for longer titles
                    pattern_parts.append(r'\b?' + re.escape(cleaned_title_for_regex) + r'\b?') # Optional word boundaries
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            # Combine all potential ways to match this section's header
            # Match at the beginning of a line, allowing leading whitespace.
            # Use non-capturing groups (?:...) where applicable.
            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries): # Check the next entry in the *sorted* list
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b?' + re.escape(next_cleaned_title_for_regex) + r'\b?')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end()) # Search from end of current match
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3: # Only consider TOC mapping successful if it yields a good number of sections
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [16]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Part 3 - Legal Proceedings...
INFO:__main__:  6: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  7: Item/Part 5 - Market for the Registrant’s Common Stock, Related Shareholde...
INFO:__main__:  8: Item/Part 6 - Reserved...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Legal Proceedings...
INFO:__main__:  12: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
INFO:__main__:  13:


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 21 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,286 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 55,961 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 107 chars

  4. Item 2 - PROPERTIES

     Type: item, Length: 1,438 chars

  5. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 186 chars

  6. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 123 chars

  7. Item 5 - MARKET FOR THE REGISTRANT’S COMMON STOCK, RELATED SHAREHOLDER MATTERS, AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 508 chars

  8. Item 6 - RESERVED

     Type: item, Length: 50,498 cha

INFO:__main__:Created 161 chunks for KO_10Q_2020-07-22.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:🔍 Universal SEC detection found 22 unique sections:
INFO:__main__:  1: Item/Part I - PART I...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part II - PART II...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Notes to Consolidated Financial Statements...
INFO:__main__:  12: Item/Part unknown - Opinion on the Financial Statemen


📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97

  table_chunks: 63

  narrative_chunks: 98

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


📊 UNIVERSAL DETECTION SUMMARY

AAPL_10K_2020-10-30.txt   | 19 sections | 172 chunks

AMZN_10K_2023-02-03.txt   | 21 sections | 210 chunks

AMZN_10Q_2024-11-01.txt   | 11 sections | 132 chunks

KO_10Q_2020-07-22.txt     |  8 sections | 161 chunks

⚖️ OLD vs UNIVERSAL Detection Comparison

Running old detection...

Running universal detection...


📊 Comparison Results:

  Old detection: 22 sections

  Universal detection: 19 sections

  Improvement: +-3 sections


📋 Old Sections:

  1. PART I

  2. Item 1A - RISK FACTORS

  3. Item 1B - UNRESOLVED STAFF COMMENTS

  4. Item 3 - LEGAL PROCEEDINGS

  5. Item 4 - MINE SAFETY DISCLOSURES

  6. PART II

  7. Item 6 - SELECTED FINANCIAL DATA

  8. Item 7 - MANAGEMENT’S DISCU

In [18]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        # escape hyphen-minus at end of class so it's not seen as a range
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     final_matches[-1] = current_match
            else:
                final_matches.append(current_match)

    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):
            line = line.strip()
            if (len(line) < 100 and
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0, # These positions are relative to the 'page' or current_section, not whole document
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            current_section += "\n\n" + page

    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content)

    if len(sections) >= 3:
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else: # Fallback if part not explicitly set for 10Q item
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        # If it's a PART section itself, format it.
        if "Item" in section.title and section.item_number:
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number

    # Fallback for named_section, content, or document type sections
    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Improved regex patterns for better capture of Item/Part numbers and titles.
    Ensures content for each DocumentSection is correctly sliced.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    patterns = [
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     final_matches[-1] = current_match
            else:
                final_matches.append(current_match)

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    final_document_sections = []
    current_part = None

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content'
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}"
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'


        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content, # Pass the correctly sliced content
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q). Very specific.
        # Group 1: Optional Page Num, Group 2: PART ID, Group 3: PART Title, Group 4: Item ID, Group 5: Item Title
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title, pipe-separated. Catches "Item 1. | Financial Statements | 3"
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (no pipes separating title)
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions. Must be long enough, starts with capital.
        # Group 1: Title
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M),
        
        # Pattern 5: Simple "PART X" line
        # Group 1: PART ID
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        # Group 1: Item ID, Group 2: Title
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]

    found_items = []
    current_part_id_context = None

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            # Strict filtering of TOC lines to remove noise
            if any(kw in line.lower() for kw in ['page', 'description', 'signatures']) and len(line) < 30:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()): # Just a page number
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()): # Just "1" or "1A"
                continue
            if len(line) < 5: # Very short lines
                continue
            if 'total' in line.lower() and re.search(r'\d', line): # Lines with numbers that look like financial totals
                continue


            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'
                    
                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column TOC
                        # The groups need careful mapping based on the regex
                        # (?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?
                        # Match.groups() will return a tuple containing all captured groups in order,
                        # including None for optional groups that didn't match.
                        # Need to adjust group indices based on how pattern is built.
                        
                        # The regex provided for Pattern 1 has 5 capturing groups:
                        # G1: PART\s*([IVX]+) -> ([IVX]+) -> part_id_cand
                        # G2: (\s*([^\n|]+?)) -> ([^\n|]+?) -> part_title_from_group
                        # G3: (\d{1,2}[A-C]?) -> item_id
                        # G4: ([^|]+?) -> item_title
                        
                        # Let's adjust group access based on actual match.groups() output
                        # (None, 'I', 'FINANCIAL INFORMATION', '1', 'Financial Statements', None) for AMZN.
                        # Or (None, None, None, '1', 'Business', None) for some K.
                        
                        # Refined group assignment
                        part_id_idx = match.re.groupindex.get('part_id_g', 0) # Use groupindex if named groups
                        part_title_idx = match.re.groupindex.get('part_title_g', 0)
                        item_id_idx = match.re.groupindex.get('item_id_g', 0)
                        item_title_idx = match.re.groupindex.get('item_title_g', 0)

                        # Simpler direct access based on fixed order of groups as provided
                        part_id_cand = match.group(2) if len(groups) >= 2 else None
                        part_title_from_group = match.group(3) if len(groups) >= 3 else None
                        item_id = match.group(4) if len(groups) >= 4 else None
                        item_title = match.group(5) if len(groups) >= 5 else None

                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand.strip()}"
                            title_for_part = part_title_from_group.strip() if part_title_from_group else f"PART {part_id_cand.strip()}"
                            found_items.append((part_id_cand.strip(), title_for_part, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            title_for_item = item_title.strip() if item_title else f"Item {item_id.strip()}"
                            found_items.append((item_id.strip(), title_for_item, section_type_raw, current_part_id_context))
                            break # Move to next line (matched)

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()):
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    unique_items = []
    seen_keys = set()
    
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip()
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip()
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip()
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
        
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="",
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]):
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        # TOC entries are already sorted by `detect_sections_from_toc_universal`

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "):
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            if toc_entry.title:
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip()
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip()
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex)
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex)
                
                if len(cleaned_title_for_regex) > 5:
                    pattern_parts.append(r'\b?' + re.escape(cleaned_title_for_regex) + r'\b?')
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b?' + re.escape(next_cleaned_title_for_regex) + r'\b?')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [19]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (5004 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in KO_10Q_2020-07-22.txt
INFO:__main__:Created 161 chunks for KO_10Q_2020-07-22.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection



📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97

  table_chunks: 63

  narrative_chunks: 98

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


📊 UNIVERSAL DETECTION SUMMARY

AAPL_10K_2020-10-30.txt   | 19 sections | 172 chunks

AMZN_10K_2023-02-03.txt   | 21 sections |   0 chunks

AMZN_10Q_2024-11-01.txt   | 11 sections |   0 chunks

KO_10Q_2020-07-22.txt     |  8 sections | 161 chunks

⚖️ OLD vs UNIVERSAL Detection Comparison

Running old detection...



NameError: name 'unique_matches' is not defined

In [21]:
def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections: List[DocumentSection] = []

    patterns = [
        # PART patterns
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns (hyphens escaped at end of class)
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),

        # Number-dot format
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Named sections
        re.compile(r'^.{0,50}\b(BUSINESS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(RISK FACTORS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(LEGAL PROCEEDINGS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(FINANCIAL STATEMENTS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(MANAGEMENT\.S DISCUSSION)\b', re.I | re.M),
        re.compile(r'^.{0,50}\b(PROPERTIES)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(CONTROLS AND PROCEDURES)\b\s*$', re.I | re.M),
    ]

    all_matches = []

    for idx, pattern in enumerate(patterns):
        for m in pattern.finditer(content):
            # extract full line
            line_start = content.rfind('\n', 0, m.start()) + 1
            line_end = content.find('\n', m.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # filter out obvious false positives
            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue
            if any(tok in full_line.lower() for tok in ['table of contents', 'index']):
                continue

            groups = m.groups()
            section_id = None
            section_title = full_line

            if groups:
                first = groups[0].strip()
                # item vs part
                if re.match(r'^\d+[A-C]?$', first, re.I):
                    section_id = first.upper()
                elif re.match(r'^[IVX]+$', first, re.I):
                    section_id = first.upper()

                # if there's a second group (named pattern), use it
                if len(groups) > 1 and groups[1]:
                    title = groups[1].strip()
                    title = re.sub(r'\[TABLE_END\].*$', '', title, flags=re.I).replace('|', '').strip()
                    if title:
                        section_title = title
                else:
                    # try to parse remainder of the line as title
                    rem = full_line[m.end() - line_start :].lstrip(" .–—-").strip()
                    if 0 < len(rem) < 200:
                        section_title = rem

            # fallback canonical IDs for pure-named sections
            if not section_id:
                up = full_line.upper()
                if 'BUSINESS' in up:
                    section_id = '1'
                elif 'RISK FACTORS' in up:
                    section_id = '1A'
                elif 'LEGAL PROCEEDINGS' in up:
                    section_id = '3'
                # add others if needed...

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'section_id': section_id or 'UNKNOWN',
                'section_title': section_title,
            })

    # sort and dedupe by start_pos
    all_matches.sort(key=lambda x: x['start_pos'])
    unique = []
    seen_starts = set()
    for m in all_matches:
        if m['start_pos'] not in seen_starts:
            seen_starts.add(m['start_pos'])
            unique.append(m)

    # build DocumentSection list
    for i, m in enumerate(unique):
        start = m['start_pos']
        end = unique[i+1]['start_pos'] if i+1 < len(unique) else len(content)
        sections.append(DocumentSection(
            id=m['section_id'],
            title=m['section_title'],
            start_char=start,
            end_char=end
        ))

    logger.info(f"Strategy 1 found {len(sections)} sections")
    return sections

In [22]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (903 chars)
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2024-11-01.txt: name 'groups' is not defined
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements (Unaudited)...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part 1 - Legal Proceedings...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main


📊 Processing Results:

  error: No chunks created


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt


✅ Found 8 sections:

  1. Item 1 - FINANCIAL STATEMENTS (UNAUDITED)

     Type: item, Length: 115,893 chars

  2. Item 2 - MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 87,923 chars

  3. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 207 chars

  4. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 1,032 chars

  5. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 220 chars

  6. Item 1A - RISK FACTORS

     Type: item, Length: 11,661 chars

  7. Item 2 - UNREGISTERED SALES OF EQUITY SECURITIES AND USE OF PROCEEDS

     Type: item, Length: 2,127 chars

  8. Item 6 - EXHIBITS

     Type: item, Length: 13,918 chars


📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97


TypeError: DocumentSection.__init__() got an unexpected keyword argument 'id'

In [None]:
def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections: List[DocumentSection] = []

    patterns = [
        # PART patterns
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),

        # Number-dot format
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Named sections
        re.compile(r'^.{0,50}\b(BUSINESS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(RISK FACTORS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(LEGAL PROCEEDINGS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(FINANCIAL STATEMENTS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(MANAGEMENT\.S DISCUSSION)\b', re.I | re.M),
        re.compile(r'^.{0,50}\b(PROPERTIES)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(CONTROLS AND PROCEDURES)\b\s*$', re.I | re.M),
    ]

    all_matches = []
    for idx, pattern in enumerate(patterns):
        for m in pattern.finditer(content):
            # grab the full line
            line_start = content.rfind('\n', 0, m.start()) + 1
            line_end = content.find('\n', m.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()
            # filters
            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue
            if any(tok in full_line.lower() for tok in ['table of contents', 'index']):
                continue

            groups = m.groups()
            section_id = None
            section_title = full_line

            if groups:
                first = groups[0].strip()
                if re.match(r'^\d+[A-C]?$', first, re.I):
                    section_id = first.upper()
                elif re.match(r'^[IVX]+$', first, re.I):
                    section_id = first.upper()

                # named‐title group if captured
                if len(groups) > 1 and groups[1]:
                    title = groups[1].strip()
                    title = re.sub(r'\[TABLE_END\].*$', '', title, flags=re.I)
                    title = title.replace('|', '').strip()
                    if title:
                        section_title = title
                else:
                    # remainder of the line after the ID
                    rem = full_line[m.end() - line_start:].lstrip(" .–—-").strip()
                    if 0 < len(rem) < 200:
                        section_title = rem

            # fallback for pure‐named sections
            up = full_line.upper()
            if not section_id:
                if 'BUSINESS' in up:
                    section_id = '1'
                elif 'RISK FACTORS' in up:
                    section_id = '1A'
                elif 'LEGAL PROCEEDINGS' in up:
                    section_id = '3'

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'section_id': section_id or 'UNKNOWN',
                'section_title': section_title,
            })

    # dedupe & sort
    all_matches.sort(key=lambda x: x['start_pos'])
    unique = []
    seen = set()
    for m in all_matches:
        if m['start_pos'] not in seen:
            seen.add(m['start_pos'])
            unique.append(m)

    # build your DocumentSection objects *positionally*
    for i, m in enumerate(unique):
        start = m['start_pos']
        end = unique[i+1]['start_pos'] if i+1 < len(unique) else len(content)
        # ** POSITIONAL args match your DocumentSection signature **
        sections.append(DocumentSection(
            m['section_id'],
            m['section_title'],
            start,
            end
        ))

    logger.info(f"Strategy 1 found {len(sections)} sections")
    return sections

In [23]:
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: i

INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (903 chars)
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2024-11-01.txt: name 'groups' is not defined
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements (Unaudited)...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part 1 - Legal Proceedings...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main


📊 Processing Results:

  error: No chunks created


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt


✅ Found 8 sections:

  1. Item 1 - FINANCIAL STATEMENTS (UNAUDITED)

     Type: item, Length: 115,893 chars

  2. Item 2 - MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 87,923 chars

  3. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 207 chars

  4. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 1,032 chars

  5. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 220 chars

  6. Item 1A - RISK FACTORS

     Type: item, Length: 11,661 chars

  7. Item 2 - UNREGISTERED SALES OF EQUITY SECURITIES AND USE OF PROCEEDS

     Type: item, Length: 2,127 chars

  8. Item 6 - EXHIBITS

     Type: item, Length: 13,918 chars


📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97


TypeError: DocumentSection.__init__() got an unexpected keyword argument 'id'

In [None]:
import re
import logging
from typing import List
from your_module import DocumentSection    # ← adjust this to wherever DocumentSection lives

logger = logging.getLogger(__name__)

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure,
    with positional DocumentSection construction.
    """
    sections: List[DocumentSection] = []

    patterns = [
        # PART headers
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM headers (with optional A-C suffix)
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—-])', re.I | re.M),

        # “1. BUSINESS”–style
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Common named sections
        re.compile(r'^.{0,50}\b(BUSINESS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(RISK FACTORS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(LEGAL PROCEEDINGS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(FINANCIAL STATEMENTS)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(MANAGEMENT\.S DISCUSSION)\b', re.I | re.M),
        re.compile(r'^.{0,50}\b(PROPERTIES)\b\s*$', re.I | re.M),
        re.compile(r'^.{0,50}\b(CONTROLS AND PROCEDURES)\b\s*$', re.I | re.M),
    ]

    all_matches = []
    for pat_idx, pat in enumerate(patterns):
        for m in pat.finditer(content):
            # extract the full line
            ln0 = content.rfind('\n', 0, m.start()) + 1
            ln1 = content.find('\n', m.end())
            if ln1 == -1:
                ln1 = len(content)
            full_line = content[ln0:ln1].strip()

            # simple filters
            if len(full_line) < 3 or len(full_line) > 400:
                continue
            up = full_line.upper()
            if ('TABLE' in up and ('START' in up or 'END' in up)) or full_line.count(' ') > 20:
                continue
            if any(tok in full_line.lower() for tok in ('table of contents','index')):
                continue

            grp = m.groups()
            sec_id = None
            sec_title = full_line

            if grp:
                first = grp[0].strip()
                # is it an item number or part numeral?
                if re.fullmatch(r'\d+[A-C]?', first, re.I):
                    sec_id = first.upper()
                elif re.fullmatch(r'[IVX]+', first, re.I):
                    sec_id = first.upper()

                # if there was a second capture (like “BUSINESS”), use it
                if len(grp) > 1 and grp[1]:
                    t = grp[1].strip()
                    t = re.sub(r'\[TABLE_END\].*$', '', t, flags=re.I).replace('|','').strip()
                    if t:
                        sec_title = t
                else:
                    # try to pull remainder of line after the ID
                    rem = full_line[m.end() - ln0 :].lstrip(" .–—-").strip()
                    if 0 < len(rem) < 200:
                        sec_title = rem

            # fallback canonical IDs for pure-named sections
            U = sec_title.upper()
            if not sec_id:
                if 'BUSINESS' in U:
                    sec_id = '1'
                elif 'RISK FACTORS' in U:
                    sec_id = '1A'
                elif 'LEGAL PROCEEDINGS' in U:
                    sec_id = '3'

            all_matches.append({
                'start': ln0,
                'end': ln1,
                'id': sec_id or 'UNKNOWN',
                'title': sec_title,
            })

    # sort & dedupe by start-pos
    all_matches.sort(key=lambda x: x['start'])
    unique = []
    seen_starts = set()
    for m in all_matches:
        if m['start'] not in seen_starts:
            seen_starts.add(m['start'])
            unique.append(m)

    # build DocumentSection (positional!)
    for idx, m in enumerate(unique):
        st = m['start']
        en = unique[idx+1]['start'] if idx+1 < len(unique) else len(content)
        sections.append(
            DocumentSection(
                m['id'],
                m['title'],
                st,
                en
            )
        )

    logger.info(f"Strategy 1 found {len(sections)} sections")
    return sections


In [24]:
# 2. Read in one of your 10-K or 10-Q text files
with open("processed_filings/AAPL/AAPL_10K_2020-10-30.txt", "r", encoding="utf-8") as f:
    content = f.read()

# 3. Run your improved strategy
sections = detect_sections_strategy_1_improved(content)

# 4. Print out what it found
for sec in sections:
    # adjust these attributes to match your DocumentSection fields
    print(f"ID: {sec.section_id} | Title: {sec.section_title!r} | "
          f"Starts @ {sec.start_char}, Ends @ {sec.end_char}")

TypeError: DocumentSection.__init__() got an unexpected keyword argument 'id'

In [25]:
# 1) Load one of your filings
path = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"
with open(path, "r", encoding="utf-8") as f:
    content = f.read()

# 2) Run your new regex-based detector
sections = detect_sections_strategy_1_improved(content)

# 3) Inspect the results
print(f"Found {len(sections)} sections:\n")
for sec in sections:
    # Assuming your DocumentSection has these attributes:
    print(f"{sec.section_id:>4} | {sec.section_title!r}")
    print(f"      chars {sec.start_char}–{sec.end_char}\n")


TypeError: DocumentSection.__init__() got an unexpected keyword argument 'id'

In [None]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    unique_matches = []
    if all_matches:
        unique_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     final_matches[-1] = current_match
            else:
                final_matches.append(current_match)

    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):
            line = line.strip()
            if (len(line) < 100 and
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            current_section += "\n\n" + page

    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content)

    if len(sections) >= 3:
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        if "Item" in section.title and section.item_number:
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number

    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Ensures content for each DocumentSection is correctly sliced.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    patterns = [
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     final_matches[-1] = current_match
            else:
                final_matches.append(current_match)

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    final_document_sections = []
    current_part = None

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content'
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}"
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'

        logger.debug(f"Creating DocumentSection: Title='{title}', Type='{section_type}', Item='{item_number}', Part='{part}', Content len: {len(section_content)}, Start: {start_pos}, End: {end_pos}") # Added debug

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC.
    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Captures: (Optional Page Num) | PART ID | PART Title (Optional) | Item ID | Item Title (Optional) | Page Num
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title, pipe-separated. Catches "Item 1. | Financial Statements | 3"
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (no pipes separating title)
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions. Must be long enough, starts with capital.
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M),
        
        # Pattern 5: Simple "PART X" line
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]

    found_items = []
    current_part_id_context = None

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            # Strict filtering of TOC lines to remove noise
            if any(kw in line.lower() for kw in ['page', 'signatures', 'exhibit', 'index', 'table of contents']) and len(line) < 30:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()): # Just a page number
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()): # Just "1" or "1A"
                continue
            if len(line) < 5: # Very short lines
                continue
            if re.search(r'\d+\s*$', line.strip()) and not re.match(r'(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?', line, re.I): # Looks like page number at end, but not a clear item/part line
                continue

            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column TOC
                        part_id_cand = match.group(2) if len(match.groups()) >= 2 else None
                        part_title_from_group = match.group(3) if len(match.groups()) >= 3 else None
                        item_id = match.group(4).strip() if len(match.groups()) >= 4 else None
                        item_title = match.group(5).strip() if len(match.groups()) >= 5 else ""
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand.strip()}"
                            title_for_part = part_title_from_group.strip() if part_title_from_group else f"PART {part_id_cand.strip()}"
                            found_items.append((part_id_cand.strip(), title_for_part, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            title_for_item = item_title.strip() if item_title else f"Item {item_id.strip()}"
                            found_items.append((item_id.strip(), title_for_item, section_type_raw, current_part_id_context))
                            break

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()):
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    unique_items = []
    seen_keys = set()
    
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip()
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip()
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip()
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
        
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="",
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]):
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "):
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            if toc_entry.title:
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip()
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip()
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex)
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex)
                
                if len(cleaned_title_for_regex) > 5:
                    pattern_parts.append(r'\b?' + re.escape(cleaned_title_for_regex) + r'\b?')
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b?' + re.escape(next_cleaned_title_for_regex) + r'\b?')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [None]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    unique_matches = []
    if all_matches:
        unique_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     final_matches[-1] = current_match
            else:
                final_matches.append(current_match)

    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):
            line = line.strip()
            if (len(line) < 100 and
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            current_section += "\n\n" + page

    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content)

    if len(sections) >= 3:
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        if "Item" in section.title and section.item_number:
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number

    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Ensures content for each DocumentSection is correctly sliced.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    patterns = [
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     final_matches[-1] = current_match
            else:
                final_matches.append(current_match)

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    final_document_sections = []
    current_part = None

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content'
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}"
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'

        logger.debug(f"Creating DocumentSection: Title='{title}', Type='{section_type}', Item='{item_number}', Part='{part}', Content len: {len(section_content)}, Start: {start_pos}, End: {end_pos}") # Added debug

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC.
    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Captures: (Optional Page Num) | PART ID | PART Title (Optional) | Item ID | Item Title (Optional) | Page Num
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title, pipe-separated. Catches "Item 1. | Financial Statements | 3"
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (no pipes separating title)
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions. Must be long enough, starts with capital.
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M),
        
        # Pattern 5: Simple "PART X" line
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]

    found_items = []
    current_part_id_context = None

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            # Strict filtering of TOC lines to remove noise
            if any(kw in line.lower() for kw in ['page', 'signatures', 'exhibit', 'index', 'table of contents']) and len(line) < 30:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()): # Just a page number
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()): # Just "1" or "1A"
                continue
            if len(line) < 5: # Very short lines
                continue
            if re.search(r'\d+\s*$', line.strip()) and not re.match(r'(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?', line, re.I): # Looks like page number at end, but not a clear item/part line
                continue

            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column TOC
                        part_id_cand = match.group(2) if len(match.groups()) >= 2 else None
                        part_title_from_group = match.group(3) if len(match.groups()) >= 3 else None
                        item_id = match.group(4).strip() if len(match.groups()) >= 4 else None
                        item_title = match.group(5).strip() if len(match.groups()) >= 5 else ""
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand.strip()}"
                            title_for_part = part_title_from_group.strip() if part_title_from_group else f"PART {part_id_cand.strip()}"
                            found_items.append((part_id_cand.strip(), title_for_part, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            title_for_item = item_title.strip() if item_title else f"Item {item_id.strip()}"
                            found_items.append((item_id.strip(), title_for_item, section_type_raw, current_part_id_context))
                            break

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()):
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    unique_items = []
    seen_keys = set()
    
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip()
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip()
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip()
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
        
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="",
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]):
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "):
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            if toc_entry.title:
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip()
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip()
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex)
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex)
                
                if len(cleaned_title_for_regex) > 5:
                    pattern_parts.append(r'\b?' + re.escape(cleaned_title_for_regex) + r'\b?')
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b?' + re.escape(next_cleaned_title_for_regex) + r'\b?')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

In [26]:
# =============================================================================
# Run the fixed tests
# These calls should be at the very end of your notebook or script,
# after all function definitions.
# =============================================================================

print("\n" + "="*80)
print("🚀 Initiating all test suites!")
print("="*80 + "\n")

results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

print("\n" + "="*80)
print("✅ All test suites completed!")
print("="*80)

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:


🚀 Initiating all test suites!


🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND 

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Part 3 - Legal Proceedings...
INFO:__main__:  6: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  7: Item/Part 5 - Market for the Registrant’s Common Stock, Related Shareholde...
INFO:__main__:  8: Item/Part 6 - Reserved...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Legal Proceedings...
INFO:__main__:  12: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
INFO:__main__:  13:


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 21 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,286 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 55,961 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 107 chars

  4. Item 2 - PROPERTIES

     Type: item, Length: 1,438 chars

  5. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 186 chars

  6. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 123 chars

  7. Item 5 - MARKET FOR THE REGISTRANT’S COMMON STOCK, RELATED SHAREHOLDER MATTERS, AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 508 chars

  8. Item 6 - RESERVED

     Type: item, Length: 50,498 cha

INFO:__main__:Created 161 chunks for KO_10Q_2020-07-22.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection



📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97

  table_chunks: 63

  narrative_chunks: 98

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


📊 UNIVERSAL DETECTION SUMMARY

AAPL_10K_2020-10-30.txt   | 19 sections | 172 chunks

AMZN_10K_2023-02-03.txt   | 21 sections |   0 chunks

AMZN_10Q_2024-11-01.txt   | 11 sections |   0 chunks

KO_10Q_2020-07-22.txt     |  8 sections | 161 chunks

⚖️ OLD vs UNIVERSAL Detection Comparison

Running old detection...



TypeError: DocumentSection.__init__() got an unexpected keyword argument 'id'

In [27]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    unique_matches = []
    if all_matches:
        unique_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = unique_matches[-1] # Fix: Use unique_matches, not final_matches

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    unique_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    unique_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     unique_matches[-1] = current_match
            else:
                unique_matches.append(current_match)

    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):
        print(f"  {i+1}: {match['full_line'][:80]}...")

    sections_to_return = [] # Renamed to avoid conflict with outer 'sections'
    current_part = None

    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        section_type = 'content' # Default
        item_number = None
        part = None
        title = match['section_title'] # Use captured title

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update for inheritance
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id:
                title = part
            elif not title:
                title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Inherit part
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id:
                title = f"Item {item_number}"
            elif not title:
                title = f"Item {item_number}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'


        sections_to_return.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections_to_return


def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):
            line = line.strip()
            if (len(line) < 100 and
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            current_section += "\n\n" + page

    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content)

    if len(sections) >= 3:
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else: # Fallback if part not explicitly set for 10Q item
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        if "Item" in section.title and section.item_number:
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number

    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting.
    Ensures content for each DocumentSection is correctly sliced.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    patterns = [
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s*([IVX]+)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n]+)', re.I | re.M),
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*PART\s*([IVX]+)\.?\s*([^\n]*)', re.I | re.M),
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(r'^\s*(BUSINESS|RISK FACTORS|LEGAL PROCEEDINGS|FINANCIAL STATEMENTS|MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS|PROPERTIES|CONTROLS AND PROCEDURES)\s*$', re.I | re.M)
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    final_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    final_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     final_matches[-1] = current_match
            else:
                final_matches.append(current_match)

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    final_document_sections = []
    current_part = None

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content'
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part
            clean_title_part = title.upper().replace(part, '').strip(' -.')
            if clean_title_part:
                title = f"{part} - {clean_title_part}"
            else:
                title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part
            clean_title_item = title.upper().replace(f"ITEM {item_number}", '').strip(' -.')
            if clean_title_item:
                title = f"Item {item_number} - {clean_title_item}"
            else:
                title = f"Item {item_number}"
        elif any(keyword in title.upper() for keyword in ['BUSINESS', 'RISK FACTORS', 'LEGAL PROCEEDINGS', 'FINANCIAL STATEMENTS', 'MANAGEMENT\'S DISCUSSION', 'PROPERTIES', 'CONTROLS AND PROCEDURES']):
            section_type = 'named_section'

        logger.debug(f"Creating DocumentSection: Title='{title}', Type='{section_type}', Item='{item_number}', Part='{part}', Content len: {len(section_content)}, Start: {start_pos}, End: {end_pos}")

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):
            line = line.strip()
            if (len(line) < 100 and
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            current_section += "\n\n" + page

    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Fixed argument name for DocumentSection constructor

    if len(sections) >= 3:
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        if "Item" in section.title and section.item_number:
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number

    return section.title or "Document Content"


def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Group 1: Optional Page Num | Part ID (Group 2) | Part Title (Group 3) | Item ID (Group 4) | Item Title (Group 5)
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title, pipe-separated.
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (no pipes separating title)
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions.
        # Group 1: Title
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M),
        
        # Pattern 5: Simple "PART X" line
        # Group 1: PART ID
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        # Group 1: Item ID, Group 2: Title
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]

    found_items = []
    current_part_id_context = None

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            if any(kw in line.lower() for kw in ['page', 'signatures', 'exhibit', 'index', 'table of contents']) and len(line) < 30:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()):
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()):
                continue
            if len(line) < 5:
                continue
            if re.search(r'\d+\s*$', line.strip()) and not re.match(r'(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?', line, re.I):
                continue

            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column TOC
                        part_id_cand = match.group(2) if len(match.groups()) >= 2 and match.group(2) else None
                        part_title_from_group = match.group(3) if len(match.groups()) >= 3 and match.group(3) else None
                        item_id = match.group(4).strip() if len(match.groups()) >= 4 and match.group(4) else None
                        item_title = match.group(5).strip() if len(match.groups()) >= 5 and match.group(5) else ""
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand.strip()}"
                            title_for_part = part_title_from_group.strip() if part_title_from_group else f"PART {part_id_cand.strip()}"
                            found_items.append((part_id_cand.strip(), title_for_part, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            title_for_item = item_title.strip() if item_title else f"Item {item_id.strip()}"
                            found_items.append((item_id.strip(), title_for_item, section_type_raw, current_part_id_context))
                            break

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()):
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    unique_items = []
    seen_keys = set()
    
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip()
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip()
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip()
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
        
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="",
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]):
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "):
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            if toc_entry.title:
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip()
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip()
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex)
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex)
                
                if len(cleaned_title_for_regex) > 5:
                    pattern_parts.append(r'\b?' + re.escape(cleaned_title_for_regex) + r'\b?')
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b?' + re.escape(next_cleaned_title_for_regex) + r'\b?')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name
        file_id = filename.replace(".txt", "")

        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        all_chunks = []
        chunk_counter = 0

        for section in sections:
            logger.debug(f"Processing section: '{section.title}', Content len: {len(section.content)}, Start: {section.start_pos}, End: {section.end_pos}") # Added debug

            if not section.content.strip():
                continue

            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            section_info = create_section_info(section, filing_metadata.form_type)

            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            if narrative_content_in_section.strip():
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            overlap_sentences = []
            current_overlap_tokens = 0

            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            if not overlap_sentences and current_chunk_sentences:
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

chunks = test_single_file()

def compare_section_strategies(content: str):
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    sections_1 = detect_sections_strategy_1_improved(content)
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    sections_2 = detect_sections_strategy_2(content)
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

if chunks:
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison)

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    print(f"\nChunk Types:\n")
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    print(f"\nSection Distribution:\n")
    sections_dist = {}
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    print("Test 1: Non-existent file\n")
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("")
    print(f"  Result: {len(empty_sections)} sections\n")

    print("\nTest 3: Malformed filename\n")
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    os.unlink(temp_file)

    print("\nTest 4: Very short text\n")
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():
        print(f"  • {section}: {count} chunks\n")

    print(f"\n✅ Quality Metrics:\n")
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    print(f"\n🔍 Sample Chunks for Review:\n")
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    if form_type == '10K' and filing_date.month <= 3:
        fiscal_year -= 1

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt


🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 127 chunks for AAPL_10K_2020-10-30.txt
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_universal_sec. Returning empty sections.
INFO:__main__:Empty content provided to detect_sections_from_toc_universal. Returning empty sections.
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmp4bq4d8qd_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting univers

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt

  2/3: AMZN_10Q_2020-05-01.txt

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 1

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:

  • Unique sections identified: 1


🔍 Sample Chunks for Review:


  TABLE example (58 tokens):

    Section: Full Document

    Preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way...


  NARRATIVE example (420 tokens):

    Section: Full Document

    Preview: aapl-20200926-K(Mark One)☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended September 26,...

⚖️ Comparison: New vs Original Approach

🚀 Key Improvements:

  ✅ Multi-strategy section detection (fallbacks for robustness)

  ✅ Sentence-aware chunking (preserves semantic boundaries)

  ✅ Overlapping chunks (maintains context across boundaries)

  ✅ Separate table processing (handles structured data better)

  ✅ Comprehensive error handling (graceful degradation)

  ✅ Rich metadata structure (better for search/filtering)

  ✅ Quality validation (ensures chunk coherenc

INFO:__main__:Created 210 chunks for AMZN_10K_2023-02-03.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 11 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements...
INFO:__main__:  2: Item/Part unknown - Legal Proceedings...
INFO:__main__:  3: Item/Part 2 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  4: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  5: Item/Part 4 - Controls and Procedures...
INFO:__main__:  6: Item/Part 1 - Legal Proceedings...
INFO:__main__:  7: Item/Part 1A - Risk Factors...
INFO:__main__:  8: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  9: Item/Part 3 - Defaults Upon Senior Securities...
INFO:__main__:  10: Item/Part 5 - Other Information...
INFO:__main__:  11: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 11 sections.



📊 Processing Results:

  total_chunks: 210

  avg_tokens: 332.1666666666667

  min_tokens: 6

  max_tokens: 1157

  chunks_with_overlap: 119

  table_chunks: 90

  narrative_chunks: 120

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10Q_2024-11-01.txt


✅ Found 11 sections:

  1. Item 1 - FINANCIAL STATEMENTS

     Type: item, Length: 34,940 chars

  2. Legal Proceedings

     Type: named_section, Length: 32,116 chars

  3. Item 2 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 45,107 chars

  4. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 4,405 chars

  5. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 2,104 chars

  6. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 162 chars

  7. Item 1A - RISK FACTORS

     Type: item, Length: 59,433 chars

  8. Item 2 - UNREGISTERED SA

TypeError: expected string or bytes-like object, got 'NoneType'

*** All chuinks Processed ^ now just need to get typeerror working.

In [32]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to DEBUG level to see detailed process
logging.basicConfig(level=logging.DEBUG) # CRITICAL CHANGE: Set to DEBUG
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)
    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    patterns = [
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    unique_matches = []
    if all_matches:
        unique_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = unique_matches[-1] # Corrected from final_matches to unique_matches

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    unique_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    unique_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     unique_matches[-1] = current_match
            else:
                unique_matches.append(current_match)

    logger.info(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):
        logger.info(f"  {i+1}: {match['full_line'][:80]}...")

    sections_to_return = []
    current_part = None

    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        section_type = 'content'
        item_number = None
        part = None
        title = match['section_title']

        if section_id and re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id:
                title = part
            elif not title:
                title = part
        elif section_id and re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id:
                title = f"Item {item_number}"
            elif not title:
                title = f"Item {item_number}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'


        sections_to_return.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections_to_return


def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):
            line = line.strip()
            if (len(line) < 100 and
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            current_section += "\n\n" + page

    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Corrected argument name for DocumentSection constructor

    if len(sections) >= 3:
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else: # Fallback if part not explicitly set for 10Q item
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        if "Item" in section.title and section.item_number:
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number

    return section.title or "Document Content"


def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Group 1: Optional Page Num | Part ID (Group 2) | Part Title (Group 3) | Item ID (Group 4) | Item Title (Group 5)
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title, pipe-separated. Catches "Item 1. | Financial Statements | 3"
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (no pipes separating title)
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions.
        # Group 1: Title
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M),
        
        # Pattern 5: Simple "PART X" line
        # Group 1: PART ID
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        # Group 1: Item ID, Group 2: Title
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]

    found_items = []
    current_part_id_context = None

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            if any(kw in line.lower() for kw in ['page', 'signatures', 'exhibit', 'index', 'table of contents']) and len(line) < 30:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()):
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()):
                continue
            if len(line) < 5:
                continue
            if re.search(r'\d+\s*$', line.strip()) and not re.match(r'(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?', line, re.I):
                continue


            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column TOC
                        part_id_cand = match.group(2) if len(match.groups()) >= 2 and match.group(2) else None
                        part_title_from_group = match.group(3) if len(match.groups()) >= 3 and match.group(3) else None
                        item_id = match.group(4).strip() if len(match.groups()) >= 4 and match.group(4) else None
                        item_title = match.group(5).strip() if len(match.groups()) >= 5 and match.group(5) else ""
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand.strip()}"
                            title_for_part = part_title_from_group.strip() if part_title_from_group else f"PART {part_id_cand.strip()}"
                            found_items.append((part_id_cand.strip(), title_for_part, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            title_for_item = item_title.strip() if item_title else f"Item {item_id.strip()}"
                            found_items.append((item_id.strip(), title_for_item, section_type_raw, current_part_id_context))
                            break

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()):
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    unique_items = []
    seen_keys = set()
    
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip()
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip()
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip()
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
        
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="",
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]):
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "):
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            if toc_entry.title:
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip()
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip()
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex)
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex)
                
                if len(cleaned_title_for_regex) > 5:
                    pattern_parts.append(r'\b?' + re.escape(cleaned_title_for_regex) + r'\b?')
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b?' + re.escape(next_cleaned_title_for_regex) + r'\b?')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name
        file_id = filename.replace(".txt", "")

        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # DEBUG: Log content length of incoming section
            logger.debug(f"Processing section: '{section.title}', Content len: {len(section.content)}, Start: {section.start_pos}, End: {section.end_pos}")

            if not section.content.strip():
                continue

            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            section_info = create_section_info(section, filing_metadata.form_type)

            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            if narrative_content_in_section.strip():
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            overlap_sentences = []
            current_overlap_tokens = 0

            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            if not overlap_sentences and current_chunk_sentences:
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

chunks = test_single_file()

def compare_section_strategies(content: str):
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    sections_1 = detect_sections_strategy_1_improved(content)
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    sections_2 = detect_sections_strategy_2(content)
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

if chunks:
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison)

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    print(f"\nChunk Types:\n")
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    print(f"\nSection Distribution:\n")
    sections_dist = {}
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    print("Test 1: Non-existent file\n")
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("")
    print(f"  Result: {len(empty_sections)} sections\n")

    print("\nTest 3: Malformed filename\n")
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    os.unlink(temp_file)

    print("\nTest 4: Very short text\n")
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():
        print(f"  • {section}: {count} chunks\n")

    print(f"\n✅ Quality Metrics:\n")
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    print(f"\n🔍 Sample Chunks for Review:\n")
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    if form_type == '10K' and filing_date.month <= 3:
        fiscal_year -= 1

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:🔍 Improved detection found 0 potential sections:
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt


🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 127 chunks for AAPL_10K_2020-10-30.txt
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_

  Total chunks: 262

  Avg tokens: 273.5

  Overlap rate: 195/262


🧪 Testing: Medium chunks, medium overlap

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt

  2/3: AMZN_10Q_2020-05-01.txt



INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (901 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AMZN_10Q_2020-05-01.txt
INFO:__main__:Created 195 chunks for AMZN_10Q_2020-05-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (901 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AMZN_10Q_2020-10-30.txt
INFO:__main__:Created 120 chunks for AMZN_10Q_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: 

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 172

  • Average chunk size: 380 tokens

  • Size range: 38 - 1692 tokens

  • Overlap rate: 61.0%


📊 Chunk Distribution by Type:

  • narrative: 106 chunks (61.6%)

  • table: 66 chunks (38.4%)


📚 Section Breakdown:

  • Full Document: 172 chunks


✅ Quality Metrics:

  • Very small chunks (<50 tokens): 2 (1.2%)

  • Large chunks (>800 tokens): 3 (1.7%)

  • Unique sections identified: 1


🔍 Sample Chunks for Review:


  TABLE example (58 tokens):

    Section: Full Document

    Preview: California | 94-2404110 | (State or other juris

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Part 3 - Legal Proceedings...
INFO:__main__:  6: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  7: Item/Part 5 - Market for the Registrant’s Common Stock, Related Shareholde...
INFO:__main__:  8: Item/Part 6 - Reserved...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Legal Proceedings...
INFO:__main__:  12: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
INFO:__main__:  13:


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 21 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,286 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 55,961 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 107 chars

  4. Item 2 - PROPERTIES

     Type: item, Length: 1,438 chars

  5. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 186 chars

  6. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 123 chars

  7. Item 5 - MARKET FOR THE REGISTRANT’S COMMON STOCK, RELATED SHAREHOLDER MATTERS, AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 508 chars

  8. Item 6 - RESERVED

     Type: item, Length: 50,498 cha

INFO:__main__:Found table of contents (1441 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AMZN_10K_2023-02-03.txt
INFO:__main__:Created 210 chunks for AMZN_10K_2023-02-03.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 11 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements...
INFO:__main__:  2: Item/Part unknown - Legal Proceedings...
INFO:__main__:  3: Item/Part 2 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  4: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  5: Item/Part 4 - Controls and Procedures...
INFO:__main__:  6: Item/Part 1 - Legal Proceedings...
INFO:__main__:  7: Item/Part 1A - Risk Factors...
INFO:__main__:  8: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  9: Item/Part 3 - Defaults Upon Senior Securities...
INFO:__main


📊 Processing Results:

  total_chunks: 210

  avg_tokens: 332.1666666666667

  min_tokens: 6

  max_tokens: 1157

  chunks_with_overlap: 119

  table_chunks: 90

  narrative_chunks: 120

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10Q_2024-11-01.txt


✅ Found 11 sections:

  1. Item 1 - FINANCIAL STATEMENTS

     Type: item, Length: 34,940 chars

  2. Legal Proceedings

     Type: named_section, Length: 32,116 chars

  3. Item 2 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 45,107 chars

  4. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 4,405 chars

  5. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 2,104 chars

  6. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 162 chars

  7. Item 1A - RISK FACTORS

     Type: item, Length: 59,433 chars

  8. Item 2 - UNREGISTERED SA

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:

Running universal detection...


📊 Comparison Results:

  Old detection: 22 sections

  Universal detection: 19 sections

  Improvement: +-3 sections


📋 Old Sections:

  1. PART I

  2. Risk Factors

  3. Unresolved Staff Comments

  4. Legal Proceedings

  5. Mine Safety Disclosures

  6. PART II

  7. Selected Financial Data

  8. Management’s Discussion and Analysis of Financial Condition and Results of Operations

  9. Quantitative and Qualitative Disclosures About Market Risk

  10. Financial Statements and Supplementary Data

  11. Notes to Consolidated Financial Statements

  12. Opinion on the Financial Statements

  13. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure

  14. Controls and Procedures

  15. PART III

  16. Executive Compensation

  17. Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters

  18. Certain Relationships and Related Transactions, and Director Independence

  19. Principal 

In [30]:
def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    patterns = [
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    unique_matches = []
    if all_matches:
        unique_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = unique_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    unique_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    unique_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     unique_matches[-1] = current_match
            else:
                unique_matches.append(current_match)

    logger.info(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):
        logger.info(f"  {i+1}: {match['full_line'][:80]}...")

    sections_to_return = []
    current_part = None

    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # FIX: Guard re.match with a check for section_id being a string
        section_id_str = match['section_id'].upper() if match['section_id'] else '' # Ensure it's an empty string if None

        section_type = 'content'
        item_number = None
        part = None
        title = match['section_title']

        if section_id_str and re.match(r'^[IVX]+$', section_id_str): # Guarded check
            section_type = 'part'
            part = f"PART {section_id_str}"
            current_part = part
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id_str:
                title = part
            elif not title:
                title = part
        elif section_id_str and re.match(r'^\d+[A-C]?$', section_id_str): # Guarded check
            section_type = 'item'
            item_number = section_id_str
            part = current_part
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id_str:
                title = f"Item {item_number}"
            elif not title:
                title = f"Item {item_number}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'


        sections_to_return.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections_to_return

In [31]:
# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 19 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 5 - Market for Registrant’s Common Equity, Related Stockholder M...
INFO:__main__:  7: Item/Part 6 - Selected Financial Data...
INFO:__main__:  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...



🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt



INFO:__main__:  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  12: Item/Part 9A - Controls and Procedures...
INFO:__main__:  13: Item/Part 9B - Other Information...
INFO:__main__:  14: Item/Part 11 - Executive Compensation...
INFO:__main__:  15: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...
INFO:__main__:Universal detection successful (Strategy 1): Found 19 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt



✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: item, Length: 103,042 chars



INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Part 3 - Legal Proceedings...
INFO:__main__:  6: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  7: Item/Part 5 - Market for the Registrant’s Common Stock, Related Shareholde...
INFO:__main__:  8: Item/Part 6 - Reserved...
INFO:__main__:  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  11: Item/Part unknown - Legal Proceedings...
INFO:__main__:  12: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
INFO:__main__:  13:


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 21 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,286 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 55,961 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 107 chars

  4. Item 2 - PROPERTIES

     Type: item, Length: 1,438 chars

  5. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 186 chars

  6. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 123 chars

  7. Item 5 - MARKET FOR THE REGISTRANT’S COMMON STOCK, RELATED SHAREHOLDER MATTERS, AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 508 chars

  8. Item 6 - RESERVED

     Type: item, Length: 50,498 cha

INFO:__main__:Created 161 chunks for KO_10Q_2020-07-22.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:🔍 Improved detection found 22 potential sections:
INFO:__main__:  1: PART I...
INFO:__main__:  2: Item 1A.    Risk Factors...
INFO:__main__:  3: Item 1B.    Unresolved Staff Comments...
INFO:__main__:  4: Item 3.    Legal Proceedings...
INFO:__main__:  5: Item 4.    Mine Safety Disclosures...
INFO:__main__:  6: PART II...
INFO:__main__:  7: Item 6.    Selected Financial Data...
INFO:__main__:  8: Item 7.    Management’s Discussion and Analysis of Financial Condition and Resul...
INFO:__main__:  9: Item 7A.    Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  10: Item 8.    Financial Statements and Supplementary Data...
INFO:__main__:  11: Notes to Consolidated Financial Statements...
INFO:__main__:  12: Opinion on the Financial Statements...
INFO:__main__:  13: Item 9.    Changes in and Disagreements with Accountants on 


📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97

  table_chunks: 63

  narrative_chunks: 98

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


📊 UNIVERSAL DETECTION SUMMARY

AAPL_10K_2020-10-30.txt   | 19 sections | 172 chunks

AMZN_10K_2023-02-03.txt   | 21 sections | 210 chunks

AMZN_10Q_2024-11-01.txt   | 11 sections | 132 chunks

KO_10Q_2020-07-22.txt     |  8 sections | 161 chunks

⚖️ OLD vs UNIVERSAL Detection Comparison

Running old detection...



NameError: name 'full_line_upper' is not defined

In [33]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to DEBUG level to see detailed process
logging.basicConfig(level=logging.DEBUG) # CRITICAL CHANGE: Set to DEBUG
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    patterns = [
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content):
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            if (len(full_line) > 400 or
                len(full_line) < 3 or
                ('TABLE' in full_line.upper() and ('START' in full_line.upper() or 'END' in full_line.upper())) or
                full_line.count(' ') > 20):
                continue

            if any(toc_indicator in full_line.lower() for toc_indicator in ['table of contents', 'index']):
                continue
            
            section_id = None
            section_title = full_line

            groups = match.groups()
            if groups:
                potential_id = groups[0].strip()
                is_item_id = re.match(r'^\d+[A-C]?$', potential_id, re.I)
                is_part_id = re.match(r'^[IVX]+$', potential_id, re.I)

                if is_item_id or is_part_id:
                    section_id = potential_id
                    if len(groups) > 1 and groups[1]:
                        section_title = groups[1].strip()
                        section_title = re.sub(r'\[TABLE_END\]\s*.*', '', section_title, flags=re.I).strip()
                        section_title = section_title.replace('|', '').strip()
                    else:
                        remaining_line_after_id = full_line[match.end() - line_start:].strip()
                        clean_line = re.sub(r'^\s*\.?\s*[-–—]?\s*', '', remaining_line_after_id).strip()
                        if clean_line and len(clean_line) < 200:
                            section_title = clean_line
                        else:
                             section_title = full_line
                else:
                    section_title = full_line
                    if 'BUSINESS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1'
                    elif 'RISK FACTORS' in full_line.upper() and not is_item_id and not is_part_id: section_id = '1A'

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id if section_id else 'unknown',
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    all_matches.sort(key=lambda x: (x['start_pos'], x['pattern_idx']))

    unique_matches = []
    if all_matches:
        unique_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = unique_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] < 100:
                if current_match['section_id'] != 'unknown' and last_added_match['section_id'] == 'unknown':
                    unique_matches[-1] = current_match
                elif current_match['section_id'] != 'unknown' and last_added_match['section_id'] != 'unknown' and current_match['pattern_idx'] < last_added_match['pattern_idx']:
                    unique_matches[-1] = current_match
                elif current_match['section_id'] == last_added_match['section_id'] and len(current_match['section_title']) < len(last_added_match['section_title']) * 0.8:
                     unique_matches[-1] = current_match
            else:
                unique_matches.append(current_match)

    logger.info(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):
        logger.info(f"  {i+1}: {match['full_line'][:80]}...")

    sections_to_return = []
    current_part = None

    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # FIX for NameError: full_line_upper needs to be derived from 'match' in this loop's scope
        full_line_upper = match['full_line'].upper()
        # FIX: Ensure section_id is always a string when passed to re.match
        section_id_str = match['section_id'].upper() if match['section_id'] is not None else '' # Use '' instead of None if match['section_id'] is None

        section_type = 'content'
        item_number = None
        part = None
        title = match['section_title']

        if section_id_str and re.match(r'^[IVX]+$', section_id_str):
            section_type = 'part'
            part = f"PART {section_id_str}"
            current_part = part
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id_str:
                title = part
            elif not title:
                title = part
        elif section_id_str and re.match(r'^\d+[A-C]?$', section_id_str):
            section_type = 'item'
            item_number = section_id_str
            part = current_part
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id_str:
                title = f"Item {item_number}"
            elif not title:
                title = f"Item {item_number}"
        elif any(keyword in full_line_upper for keyword in # This 'full_line_upper' is now correctly scoped
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'


        sections_to_return.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections_to_return


def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):
            line = line.strip()
            if (len(line) < 100 and
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            current_section += "\n\n" + page

    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Corrected argument name for DocumentSection constructor

    if len(sections) >= 3:
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    Handles 10K/10Q specific mappings and part/item inheritance.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            if part_number == 'PART I':
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II':
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else: # Fallback if part not explicitly set for 10Q item
                if item_number in ITEM_NAME_MAP_10Q_PART_I:
                    item_name = ITEM_NAME_MAP_10Q_PART_I[item_number]
                    return f"Part I, Item {item_number} - {item_name}"
                elif item_number in ITEM_NAME_MAP_10Q_PART_II:
                    item_name = ITEM_NAME_MAP_10Q_PART_II[item_number]
                    return f"Part II, Item {item_number} - {item_name}"
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number:
        if "Item" in section.title and section.item_number:
            clean_title_suffix = section.title.replace(part_number, '').strip(' -.')
            return f"{part_number} - {clean_title_suffix}"
        return part_number

    return section.title or "Document Content"


def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    item_patterns = [
        # Pattern 1: Multi-column TOC entry with PART, Item, and Title (e.g., KO 10-Q)
        # Group 1: Optional Page Num | Part ID (Group 2) | Part Title (Group 3) | Item ID (Group 4) | Item Title (Group 5)
        re.compile(r'(?i)(?:Page\s*\|\s*)?\s*(PART\s*([IVX]+)\.?(?:\s*([^\n|]+?))?\s*\|\s*)?Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 2: Simpler Item/Part line with Title, pipe-separated. Catches "Item 1. | Financial Statements | 3"
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*\|\s*([^\n|]+?)(?:\s*\|\s*\d+)?', re.M),
        
        # Pattern 3: Standalone Item/Part line with Title (no pipes separating title)
        # Group 1: Item/PART ID, Group 2: Title
        re.compile(r'(?i)^\s*(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?\s*([^\n|]+)', re.M),
        
        # Pattern 4: Generic TOC titles, often sub-sections or long descriptions.
        # Group 1: Title
        re.compile(r'^\s*([A-Z][A-Za-z0-9\s\',&\(\)\-\.]{15,})\s*(?:\|\s*\d+)?$', re.M),
        
        # Pattern 5: Simple "PART X" line
        # Group 1: PART ID
        re.compile(r'(?i)^\s*PART\s*([IVX]+)\s*$', re.M),
        
        # Pattern 6: Number-dot format (e.g., "1. Business") usually at start of line
        # Group 1: Item ID, Group 2: Title
        re.compile(r'^\s*(\d{1,2}[A-C]?)\.\s*([^\n|]+)', re.M),
    ]

    found_items = []
    current_part_id_context = None

    if toc_content:
        for line in toc_content.split('\n'):
            line = line.strip()
            if not line:
                continue
            
            if any(kw in line.lower() for kw in ['page', 'signatures', 'exhibit', 'index', 'table of contents']) and len(line) < 30:
                continue
            if re.match(r'^\s*\d+\s*$', line.strip()):
                continue
            if re.match(r'^\s*(\d{1,2}[A-C]?)\s*$', line.strip()):
                continue
            if len(line) < 5:
                continue
            if re.search(r'\d+\s*$', line.strip()) and not re.match(r'(?:Item|PART)\s*(\d{1,2}[A-C]?|[IVX]+)\.?', line, re.I):
                continue


            for pattern in item_patterns:
                match = pattern.search(line)
                if match:
                    item_id = None
                    item_title = ""
                    section_type_raw = 'unknown'

                    if pattern == item_patterns[0]: # Pattern 1: Complex multi-column TOC
                        part_id_cand = match.group(2) if len(match.groups()) >= 2 and match.group(2) else None
                        part_title_from_group = match.group(3) if len(match.groups()) >= 3 and match.group(3) else None
                        item_id = match.group(4).strip() if len(match.groups()) >= 4 and match.group(4) else None
                        item_title = match.group(5).strip() if len(match.groups()) >= 5 and match.group(5) else ""
                        
                        if part_id_cand:
                            current_part_id_context = f"PART {part_id_cand.strip()}"
                            title_for_part = part_title_from_group.strip() if part_title_from_group else f"PART {part_id_cand.strip()}"
                            found_items.append((part_id_cand.strip(), title_for_part, 'part', current_part_id_context))
                        
                        if item_id:
                            section_type_raw = 'item'
                            title_for_item = item_title.strip() if item_title else f"Item {item_id.strip()}"
                            found_items.append((item_id.strip(), title_for_item, section_type_raw, current_part_id_context))
                            break

                    elif pattern in [item_patterns[1], item_patterns[2], item_patterns[5]]: # Patterns with ID as group 1, Title as group 2 (or inferred from line)
                        item_id = match.group(1).strip() if match.group(1) else None
                        item_title = match.group(2).strip() if len(match.groups()) > 1 and match.group(2) else ""

                        is_item = re.match(r'^\d+[A-C]?$', item_id, re.I)
                        is_part = re.match(r'^[IVX]+$', item_id, re.I)

                        if is_item:
                            section_type_raw = 'item'
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                        elif is_part:
                            section_type_raw = 'part'
                            current_part_id_context = f"PART {item_id}"
                            found_items.append((item_id, item_title, section_type_raw, current_part_id_context))
                            break
                    
                    elif pattern == item_patterns[3]: # Generic titles (Pattern 4: e.g., "Consolidated Statements of Cash Flows")
                        item_title = match.group(1).strip()
                        if item_title and len(item_title) > 10 and not re.match(r'^\d+(\.\d+)?$', item_title.replace('.', '').strip()):
                             found_items.append((None, item_title, 'named_section', current_part_id_context))
                             break
                    
                    elif pattern == item_patterns[4]: # Simple "PART X" line (Pattern 5)
                        item_id = match.group(1).strip()
                        current_part_id_context = f"PART {item_id}"
                        found_items.append((item_id, f"PART {item_id}", 'part', current_part_id_context))
                        break

    unique_items = []
    seen_keys = set()
    
    processed_items_for_dedup = []
    for item_data in found_items:
        item_id, title_raw, section_type_raw, part_context = item_data
        
        cleaned_title = re.sub(r'\|\s*\d+\s*$', '', title_raw).strip()
        cleaned_title = re.sub(r'\s*\.\s*$', '', cleaned_title).strip()
        cleaned_title = re.sub(r'\[TABLE_END\]\s*.*', '', cleaned_title, flags=re.I).strip()
        cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
        
        if not cleaned_title or len(cleaned_title) < 5 or re.match(r'^\d+(\.\d+)?$', cleaned_title):
            continue

        processed_items_for_dedup.append({
            'item_id': item_id,
            'title': cleaned_title,
            'type': section_type_raw,
            'part': part_context
        })

    processed_items_for_dedup.sort(key=lambda x: (x['part'] if x['part'] else '', x['item_id'] if x['item_id'] else '', x['title']))

    for item in processed_items_for_dedup:
        key = (item['item_id'], item['title'], item['type'], item['part'])
        if key not in seen_keys:
            unique_items.append(DocumentSection(
                title=item['title'],
                content="",
                section_type=item['type'],
                item_number=item['item_id'] if item['type'] == 'item' else None,
                part=item['part'],
                start_pos=0,
                end_pos=0
            ))
            seen_keys.add(key)
    
    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for i, sec in enumerate(unique_items[:15]):
        logger.info(f"  • ID: {sec.item_number if sec.item_number else sec.part if sec.part else 'None'}, Type: {sec.section_type}, Title: {sec.title[:60]}...")

    return unique_items


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            pattern_parts = []
            
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?')
            if toc_entry.part and toc_entry.part.startswith("PART "):
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", "")) + r'\.?')
            
            if toc_entry.title:
                cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', toc_entry.title).strip()
                cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', cleaned_title_for_regex).strip()
                cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', cleaned_title_for_regex)
                cleaned_title_for_regex = re.sub(r'\s+', r'\s+', cleaned_title_for_regex)
                
                if len(cleaned_title_for_regex) > 5:
                    pattern_parts.append(r'\b?' + re.escape(cleaned_title_for_regex) + r'\b?')
                else:
                    pattern_parts.append(re.escape(cleaned_title_for_regex))
                
            if not pattern_parts:
                logger.warning(f"No valid pattern parts for TOC entry: '{toc_entry.title}'. Skipping.")
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()
                
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    elif next_toc_entry.part and next_toc_entry.part.startswith("PART "):
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")) + r'\.?')
                    if next_toc_entry.title:
                        next_cleaned_title_for_regex = re.sub(r'\|\s*\d+', '', next_toc_entry.title).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s*\.\s*$', '', next_cleaned_title_for_regex).strip()
                        next_cleaned_title_for_regex = re.sub(r'\s+-\s+', r'\s*[-–—]?\s*', next_cleaned_title_for_regex)
                        next_cleaned_title_for_regex = re.sub(r'\s+', r'\s+', next_cleaned_title_for_regex)
                        if len(next_cleaned_title_for_regex) > 5:
                            next_pattern_parts.append(r'\b?' + re.escape(next_cleaned_title_for_regex) + r'\b?')
                        else:
                            next_pattern_parts.append(re.escape(next_cleaned_title_for_regex))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name
        file_id = filename.replace(".txt", "")

        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # DEBUG: Log content length of incoming section
            logger.debug(f"Processing section: '{section.title}', Content len: {len(section.content)}, Start: {section.start_pos}, End: {section.end_pos}")

            if not section.content.strip():
                continue

            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            section_info = create_section_info(section, filing_metadata.form_type)

            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            if narrative_content_in_section.strip():
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            overlap_sentences = []
            current_overlap_tokens = 0

            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            if not overlap_sentences and current_chunk_sentences:
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

chunks = test_single_file()

def compare_section_strategies(content: str):
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    sections_1 = detect_sections_strategy_1_improved(content)
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    sections_2 = detect_sections_strategy_2(content)
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

if chunks:
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison)

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    print(f"\nChunk Types:\n")
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    print(f"\nSection Distribution:\n")
    sections_dist = {}
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    print("Test 1: Non-existent file\n")
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("")
    print(f"  Result: {len(empty_sections)} sections\n")

    print("\nTest 3: Malformed filename\n")
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    os.unlink(temp_file)

    print("\nTest 4: Very short text\n")
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():
        print(f"  • {section}: {count} chunks\n")

    print(f"\n✅ Quality Metrics:\n")
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    print(f"\n🔍 Sample Chunks for Review:\n")
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts[0], parts[1], parts[2] # Corrected unpacking

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    if form_type == '10K' and filing_date.month <= 3:
        fiscal_year -= 1

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:🔍 Improved detection found 0 potential sections:
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INF

🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 127 chunks for AAPL_10K_2020-10-30.txt
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_universal_sec. Returning empty sections.
INFO:__main__:Empty content provided to detect_sections_from_toc_universal. Returning empty sections.
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmp34udbd2z_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting univers

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt

  2/3: AMZN_10Q_2020-05-01.txt

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 1

INFO:__main__:  13: Item/Part 9B - Other Information...
INFO:__main__:  14: Item/Part 11 - Executive Compensation...
INFO:__main__:  15: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...
INFO:__main__:Universal detection successful (Strategy 1): Found 19 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 21 unique sections:
INFO:__main__:  1: Item/Part 1 - Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 2 - Properties...
INFO:__main__:  5: Item/Par


✅ Found 19 sections:

  1. Item 1 - BUSINESS

     Type: item, Length: 13,266 chars

  2. Item 1A - RISK FACTORS

     Type: item, Length: 61,136 chars

  3. Item 1B - UNRESOLVED STAFF COMMENTS

     Type: item, Length: 582 chars

  4. Item 3 - LEGAL PROCEEDINGS

     Type: item, Length: 898 chars

  5. Item 4 - MINE SAFETY DISCLOSURES

     Type: item, Length: 108 chars

  6. Item 5 - MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES

     Type: item, Length: 4,182 chars

  7. Item 6 - SELECTED FINANCIAL DATA

     Type: item, Length: 1,745 chars

  8. Item 7 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 33,154 chars

  9. Item 7A - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 6,799 chars

  10. Item 8 - FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA

     Type: item, Length: 103,042 chars


📊 Processing Results:

  total_

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 11 unique sections:
INFO:__main__:  1: Item/Part 1 - Financial Statements...
INFO:__main__:  2: Item/Part unknown - Legal Proceedings...
INFO:__main__:  3: Item/Part 2 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  4: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  5: Item/Part 4 - Controls and Procedures...
INFO:__main__:  6: Item/Part 1 - Legal Proceedings...
INFO:__main__:  7: Item/Part 1A - Risk Factors...
INFO:__main__:  8: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  9: Item/Part 3 - Defaults Upon Senior Securities...
INFO:__main__:  10: Item/Part 5 - Other Information...
INFO:__main__:  11: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 11 sections.
INFO:__main__:Attempting universal SEC section detection
INFO


✅ Found 11 sections:

  1. Item 1 - FINANCIAL STATEMENTS

     Type: item, Length: 34,940 chars

  2. Legal Proceedings

     Type: named_section, Length: 32,116 chars

  3. Item 2 - MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS

     Type: item, Length: 45,107 chars

  4. Item 3 - QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK

     Type: item, Length: 4,405 chars

  5. Item 4 - CONTROLS AND PROCEDURES

     Type: item, Length: 2,104 chars

  6. Item 1 - LEGAL PROCEEDINGS

     Type: item, Length: 162 chars

  7. Item 1A - RISK FACTORS

     Type: item, Length: 59,433 chars

  8. Item 2 - UNREGISTERED SALES OF EQUITY SECURITIES AND USE OF PROCEEDS

     Type: item, Length: 103 chars

  9. Item 3 - DEFAULTS UPON SENIOR SECURITIES

     Type: item, Length: 153 chars

  10. Item 5 - OTHER INFORMATION

     Type: item, Length: 3,031 chars


📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  