In [None]:
# SEC Filing Preprocessing Strategy - From Scratch
# Let's build a robust chunking approach step by step

import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. IMPROVED SEC MAPPINGS WITH FALLBACKS
# =============================================================================

# Keep the excellent domain knowledge from the original
ITEM_NAME_MAP_10K = {
    "1": "Business", 
    "1A": "Risk Factors", 
    "1B": "Unresolved Staff Comments", 
    "1C": "Cybersecurity",
    "2": "Properties", 
    "3": "Legal Proceedings", 
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved", 
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk", 
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure", 
    "9A": "Controls and Procedures",
    "9B": "Other Information", 
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance", 
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence", 
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules", 
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)
    
    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')
    
    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)
    
    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines
    
    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================



def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []
    
    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        r'(?im)^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$',
        r'(?im)^PART\s+([IVX]+)(?:\s*[-–—].*?)?$',
        
        # ITEM patterns - much more flexible
        r'(?im)^\s*ITEM\s+(\d{1,2}[A-C]?)(?:\.|\s|[-–—])',
        r'(?im)^ITEM\s+(\d{1,2}[A-C]?)(?:\.|\s|[-–—])',
        r'(?im)Item\s+(\d{1,2}[A-C]?)(?:\.|\s|[-–—])',
        
        # Number-dot format common in SEC filings
        r'(?im)^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}',
        
        # Content-based patterns for known sections
        r'(?im)^.{0,50}(BUSINESS)\s*$',
        r'(?im)^.{0,50}(RISK FACTORS)\s*$',
        r'(?im)^.{0,50}(LEGAL PROCEEDINGS)\s*$',
        r'(?im)^.{0,50}(FINANCIAL STATEMENTS)\s*$',
        r'(?im)^.{0,50}(MANAGEMENT.S DISCUSSION)\s*',
        r'(?im)^.{0,50}(PROPERTIES)\s*$',
        r'(?im)^.{0,50}(CONTROLS AND PROCEDURES)\s*$',
    ]
    
    all_matches = []
    
    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in re.finditer(pattern, content):
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)
            
            full_line = content[line_start:line_end].strip()
            
            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue
            
            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'
            
            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })
    
    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200 
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)
    
    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")
    
    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)
        
        section_content = content[start_pos:end_pos].strip()
        
        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None
        
        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in 
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']
        
        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))
    
    return sections

# Update the main detect_sections_robust function
def detect_sections_robust_fixed(content: str) -> List[DocumentSection]:
    """
    Updated robust detection with the fixed strategy
    """
    logger.info("Attempting Fixed Strategy 1: Improved regex-based section detection")
    sections = detect_sections_strategy_1_fixed(content)
    
    if len(sections) >= 3:  # Good result
        logger.info(f"Fixed Strategy 1 successful: Found {len(sections)} sections")
        return sections
    
    logger.warning("Fixed Strategy 1 found few sections, trying Strategy 2")
    sections = detect_sections_strategy_2(content)
    
    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections
    
    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

# Test the fixed detection
print("🧪 Testing fixed section detection...")
print("Add this to your notebook and replace the detect_sections_robust function")



def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []
    
    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')
    
    current_section = ""
    current_title = "Document Content"
    
    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue
            
        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []
        
        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))
        
        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))
            
            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page
    
    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))
    
    return sections

def detect_sections_robust(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1(content)
    
    if len(sections) >= 3:  # Good result
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections
    
    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)
    
    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections
    
    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    
    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]
    
    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100, 
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []
    
    current_chunk_sentences = []
    current_tokens = 0
    
    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))
        
        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })
            
            # Create overlap: keep last few sentences
            overlap_sentences = []
            overlap_tokens = 0
            
            # Add sentences from the end until we reach overlap target
            for sent in reversed(current_chunk_sentences):
                sent_tokens = len(encoding.encode(sent))
                if overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    overlap_tokens += sent_tokens
                else:
                    break
            
            # Start new chunk with overlap + current sentence
            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = overlap_tokens + sentence_tokens
        else:
            # Add sentence to current chunk
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens
    
    # Add final chunk if it has content
    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))
        
        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })
    
    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []
    
    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        # Clean table markers
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()
        
        if table_text:  # Only add non-empty tables
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })
    
    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()
    
    return tables, narrative_content

# =============================================================================
# 7. MAIN PROCESSING FUNCTION
# =============================================================================

def process_filing_robust(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Main function that puts it all together
    """
    try:
        # Extract filing metadata
        filename = Path(file_path).name
        file_id = filename.replace(".txt", "")
        parts = file_id.split('_')
        
        if len(parts) != 3:
            logger.error(f"Invalid filename format: {filename}")
            return []
        
        ticker, form_type, filing_date_str = parts
        
        # Create filing metadata
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
        
        # Adjust fiscal year for 10-K filings
        if form_type == '10K' and filing_date.month < 4:
            fiscal_year -= 1
        
        filing_metadata = FilingMetadata(
            ticker=ticker,
            form_type=form_type,
            filing_date=filing_date_str,
            fiscal_year=fiscal_year,
            fiscal_quarter=fiscal_quarter,
            file_path=file_path
        )
        
        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        
        cleaned_content = clean_sec_text(raw_content)
        
        # Detect sections
        sections = detect_sections_robust(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")
        
        # Process each section
        all_chunks = []
        chunk_counter = 0
        
        for section in sections:
            # Extract tables from this section
            tables, narrative_content = extract_and_process_tables(section.content)
            
            # Create section info string
            section_info = create_section_info(section, form_type)
            
            # Process tables
            for table in tables:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1
            
            # Process narrative content
            if narrative_content.strip():
                narrative_chunks = create_overlapping_chunks(
                    narrative_content, target_tokens, overlap_tokens
                )
                
                for chunk_data in narrative_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1
        
        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks
        
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information
    """
    if section.section_type == 'item' and section.item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(section.item_number, "Unknown Section")
            return f"Item {section.item_number} - {item_name}"
        elif form_type == '10Q':
            # Determine which part this item belongs to
            if section.item_number in ITEM_NAME_MAP_10Q_PART_I:
                item_name = ITEM_NAME_MAP_10Q_PART_I[section.item_number]
                return f"Part I, Item {section.item_number} - {item_name}"
            else:
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(section.item_number, "Unknown Section")
                return f"Part II, Item {section.item_number} - {item_name}"
    
    elif section.section_type == 'part' and section.part:
        return section.part
    
    else:
        return section.title or "Document Content"

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}
    
    token_counts = [chunk.token_count for chunk in chunks]
    
    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }
    
    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!")
print("="*60)
print("Key improvements over original approach:")
print("✅ Multi-strategy section detection with fallbacks")
print("✅ Sentence-aware chunking with overlap")
print("✅ Robust error handling and logging")
print("✅ Structured data classes for better organization")
print("✅ Quality validation and statistics")
print("✅ Separate table and narrative processing")
print("="*60)

🧪 Testing fixed section detection...
Add this to your notebook and replace the detect_sections_robust function
🚀 SEC Filing Preprocessing Strategy - Ready for Testing!
Key improvements over original approach:
✅ Multi-strategy section detection with fallbacks
✅ Sentence-aware chunking with overlap
✅ Robust error handling and logging
✅ Structured data classes for better organization
✅ Quality validation and statistics
✅ Separate table and narrative processing


In [14]:
# Cell 1: Test with a single file
# =============================================================================
def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"
    
    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}")
        print("="*50)
        
        chunks = process_filing_robust(test_file)
        stats = validate_chunks(chunks)
        
        print("📊 Processing Results:")
        for key, value in stats.items():
            print(f"  {key}: {value}")
        
        print("\n📝 Sample Chunks:")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):")
            print(f"  Section: {chunk.section_info}")
            print(f"  Tokens: {chunk.token_count}")
            print(f"  Text preview: {chunk.text[:200]}...")
        
        return chunks
    else:
        print(f"❌ File not found: {test_file}")
        print("Please update the file path to match your data structure")
        return []

# Run the test
chunks = test_single_file()


INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 152 chunks for AAPL_10K_2020-10-30.txt


🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt
📊 Processing Results:
  total_chunks: 152
  avg_tokens: 365.07236842105266
  min_tokens: 38
  max_tokens: 1692
  chunks_with_overlap: 85
  table_chunks: 66
  narrative_chunks: 86
  unique_sections: 1

📝 Sample Chunks:

Chunk 1 (table):
  Section: Full Document
  Tokens: 58
  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...

Chunk 2 (table):
  Section: Full Document
  Tokens: 240
  Text preview: Title of each class | Trading symbol(s) | Name of each exchange on which registered | Common Stock, $0.00001 par value per share | AAPL | The Nasdaq Stock Market LLC | 1.000% Notes due 2022 | — | The ...

Chunk 3 (table):
  Section: Full Document
  Tokens: 41
  Text preview: Large accelerated filer | ☒ | Accelerated filer | ☐ | Non-accelerated filer | ☐ | Smaller repo

In [17]:
def compare_section_strategies(content_sample: str):
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies")
    print("="*50)
    
    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content_sample)
    print(f"Strategy 1 (Regex): {len(sections_1)} sections")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")
    
    print()
    
    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content_sample)
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")
    
    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        sample_content = f.read()[:10000]  # First 10k characters
    
    sections_1, sections_2 = compare_section_strategies(sample_content)

🔍 Comparing Section Detection Strategies
🔍 Improved detection found 1 potential sections:
  1: PART I...
Strategy 1 (Regex): 1 sections
  1. Part I...

Strategy 2 (Page-based): 1 sections
  1. Document Content...


In [4]:
def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze")
        return
    
    print("📊 Chunking Quality Analysis")
    print("="*50)
    
    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]
    
    print(f"Token Distribution:")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}")
    print(f"  Min: {min(token_counts)}")
    print(f"  Max: {max(token_counts)}")
    
    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1
    
    print(f"\nChunk Types:")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}")
    
    # Section distribution
    sections = {}
    for chunk in chunks:
        sections[chunk.section_info] = sections.get(chunk.section_info, 0) + 1
    
    print(f"\nSection Distribution:")
    for section, count in sorted(sections.items()):
        print(f"  {section}: {count} chunks")
    
    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)")
    
    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)

📊 Chunking Quality Analysis
Token Distribution:
  Mean: 365.1
  Median: 435
  Min: 38
  Max: 1692

Chunk Types:
  table: 66
  narrative: 86

Section Distribution:
  Full Document: 152 chunks

Overlap Analysis:
  Chunks with overlap: 85/152 (55.9%)


In [5]:
def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet")
        return
    
    test_file = chunks[0].filing_metadata.file_path
    
    print("🔧 Testing Different Chunking Parameters")
    print("="*50)
    
    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]
    
    results = {}
    
    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}")
        test_chunks = process_filing_robust(
            test_file, 
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )
        
        stats = validate_chunks(test_chunks)
        results[config['name']] = stats
        
        print(f"  Total chunks: {stats['total_chunks']}")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}")
    
    return results

# Test different parameters
param_results = test_chunking_parameters()

INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 214 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 152 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 117 chunks for AAPL_10K_2020-10-30.txt


🔧 Testing Different Chunking Parameters

🧪 Testing: Small chunks, low overlap
  Total chunks: 214
  Avg tokens: 259.3
  Overlap rate: 147/214

🧪 Testing: Medium chunks, medium overlap
  Total chunks: 152
  Avg tokens: 365.1
  Overlap rate: 85/152

🧪 Testing: Large chunks, high overlap
  Total chunks: 117
  Avg tokens: 474.3
  Overlap rate: 50/117


In [6]:
def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling")
    print("="*50)
    
    # Test 1: Non-existent file
    print("Test 1: Non-existent file")
    fake_chunks = process_filing_robust("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)")
    
    # Test 2: Empty file
    print("\nTest 2: Empty content")
    empty_sections = detect_sections_robust("")
    print(f"  Result: {len(empty_sections)} sections")
    
    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name
    
    bad_chunks = process_filing_robust(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)")
    
    # Clean up
    os.unlink(temp_file)
    
    # Test 4: Very short text
    print("\nTest 4: Very short text")
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks")

test_error_handling()

ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting Strategy 1: Regex-based section detection
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmp4dqo7j8s_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0


🛡️ Testing Error Handling
Test 1: Non-existent file
  Result: 0 chunks (expected 0)

Test 2: Empty content
  Result: 1 sections

Test 3: Malformed filename
  Result: 0 chunks (expected 0)

Test 4: Very short text
  Result: 0 chunks


In [7]:
def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)")
    print("="*50)
    
    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}")
        return []
    
    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))
    
    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...")
    
    all_results = []
    
    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}")
        
        file_chunks = process_filing_robust(file_path)
        stats = validate_chunks(file_chunks)
        
        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })
    
    # Summary statistics
    print(f"\n📊 Batch Processing Summary:")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0
    
    print(f"  Total files processed: {len(all_results)}")
    print(f"  Total chunks created: {total_chunks}")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}")
    
    print(f"\n📋 Per-file results:")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables")
    
    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)

INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Found 1 sections in AMZN_10Q_2022-04-29.txt
INFO:__main__:Created 109 chunks for AMZN_10Q_2022-04-29.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Found 1 sections in AMZN_10Q_2020-05-01.txt
INFO:__main__:Created 183 chunks for AMZN_10Q_2020-05-01.txt
INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Found 1 sections in AMZN_10Q_2020-10-30.txt
INFO:__main__:Created 106 chunks for AMZN_10Q_2020-10-30.txt


🔄 Testing Batch Processing (max 3 files)
Processing 3 files...
  1/3: AMZN_10Q_2022-04-29.txt
  2/3: AMZN_10Q_2020-05-01.txt
  3/3: AMZN_10Q_2020-10-30.txt

📊 Batch Processing Summary:
  Total files processed: 3
  Total chunks created: 398
  Average chunks per file: 132.7

📋 Per-file results:
  AMZN_10Q_2022-04-29.txt: 109 chunks, 1 sections, 51 tables
  AMZN_10Q_2020-05-01.txt: 183 chunks, 1 sections, 131 tables
  AMZN_10Q_2020-10-30.txt: 106 chunks, 1 sections, 48 tables


In [8]:
def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary")
    print("="*60)
    
    if not chunks:
        print("No chunks to analyze - run test_single_file() first")
        return
    
    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })
    
    df = pd.DataFrame(chunk_data)
    
    print("🎯 Key Insights:")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})")
    print(f"  • Total chunks: {len(df)}")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%")
    
    print(f"\n📊 Chunk Distribution by Type:")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)")
    
    print(f"\n📚 Section Breakdown:")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks")
    
    # Quality metrics
    print(f"\n✅ Quality Metrics:")
    
    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)")
    
    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)")
    
    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}")
    
    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:")
    
    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):")
        print(f"    Section: {sample['section']}")
        print(f"    Preview: {chunk_obj.text[:150]}...")
    
    return df

# Create final summary
summary_df = create_analysis_summary()

📈 Final Analysis Summary
🎯 Key Insights:
  • Document: AAPL 10K (FY2020)
  • Total chunks: 152
  • Average chunk size: 365 tokens
  • Size range: 38 - 1692 tokens
  • Overlap rate: 55.9%

📊 Chunk Distribution by Type:
  • narrative: 86 chunks (56.6%)
  • table: 66 chunks (43.4%)

📚 Section Breakdown:
  • Full Document: 152 chunks

✅ Quality Metrics:
  • Very small chunks (<50 tokens): 2 (1.3%)
  • Large chunks (>800 tokens): 3 (2.0%)
  • Unique sections identified: 1

🔍 Sample Chunks for Review:

  TABLE example (58 tokens):
    Section: Full Document
    Preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way...

  NARRATIVE example (420 tokens):
    Section: Full Document
    Preview: aapl-20200926-K(Mark One)☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended September 26,...


In [9]:
def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach")
    print("="*60)
    
    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]
    
    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]
    
    print("🚀 Key Improvements:")
    for improvement in improvements:
        print(f"  {improvement}")
    
    print(f"\n⚖️ Potential Tradeoffs:")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}")
    
    print(f"\n🎯 Recommended Next Steps:")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]
    
    for step in next_steps:
        print(f"  {step}")

compare_with_original()

print("\n" + "="*60)
print("🎉 Preprocessing Strategy Testing Complete!")
print("="*60)
print("Next step: Convert this notebook into modular Python files")
print("Then: Implement the embedding pipeline and MCP server!")
print("="*60)

⚖️ Comparison: New vs Original Approach
🚀 Key Improvements:
  ✅ Multi-strategy section detection (fallbacks for robustness)
  ✅ Sentence-aware chunking (preserves semantic boundaries)
  ✅ Overlapping chunks (maintains context across boundaries)
  ✅ Separate table processing (handles structured data better)
  ✅ Comprehensive error handling (graceful degradation)
  ✅ Rich metadata structure (better for search/filtering)
  ✅ Quality validation (ensures chunk coherence)
  ✅ Configurable parameters (tunable for different use cases)

⚖️ Potential Tradeoffs:
  ⚠️ Slightly more complex code (but more maintainable)
  ⚠️ More chunks due to overlap (but better retrieval)
  ⚠️ Processing takes longer (but more robust results)

🎯 Recommended Next Steps:
  1. Test on more diverse filings to validate robustness
  2. Fine-tune chunking parameters based on embedding performance
  3. Add semantic similarity checks between overlapping chunks
  4. Implement incremental processing for large datasets
  5. A

In [18]:
def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting
    """
    sections = []
    
    # Universal patterns for table-formatted SEC filings
    patterns = [
        # Table-based ITEM patterns (most common in your files)
        r'(?i)\[TABLE_START\]\s*Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^\[]+?)\s*\[TABLE_END\]',
        r'(?i)\[TABLE_START\]\s*Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)',
        
        # Table-based PART patterns
        r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]',
        r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\|\s*([^|]+)',
        r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\[TABLE_END\]',
        
        # Standalone ITEM patterns (fallback)
        r'(?i)^\s*Item\s+(\d{1,2}[A-C]?)\.\s*([^\n]+)',
        r'(?i)Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)',
        
        # Standalone PART patterns (fallback)
        r'(?i)^\s*PART\s+([IVX]+)\s*([^\n]*)',
        r'(?i)PART\s+([IVX]+)\s*\|\s*([^|]+)',
        
        # Number-only patterns in tables
        r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)',
        
        # Headers that might appear standalone
        r'(?i)^(Item\s+\d{1,2}[A-C]?\.\s+[^|]+?)$',
        r'(?i)^(PART\s+[IVX]+)(?:\s*[-–—]\s*(.+))?$',
    ]
    
    all_matches = []
    
    for pattern_idx, pattern in enumerate(patterns):
        for match in re.finditer(pattern, content, re.MULTILINE):
            # Get context around the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)
            
            full_line = content[line_start:line_end].strip()
            
            # Skip if this looks like metadata or page headers
            if any(skip_word in full_line.lower() for skip_word in 
                   ['page', 'signature', 'exhibit', 'index', 'table of contents']):
                continue
                
            # Skip very long lines that are probably not headers
            if len(full_line) > 500:
                continue
            
            # Extract section information
            groups = match.groups()
            
            if len(groups) >= 2 and groups[1]:
                section_id = groups[0].strip()
                section_title = groups[1].strip()
                # Clean up section title
                section_title = re.sub(r'\[TABLE_END\].*', '', section_title).strip()
                section_title = section_title.replace('|', '').strip()
            elif len(groups) >= 1:
                section_id = groups[0].strip()
                section_title = f"Section {section_id}"
            else:
                section_id = 'unknown'
                section_title = full_line
            
            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id,
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })
    
    # Remove duplicates - matches within 100 characters
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 100 
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)
    
    # Remove very similar section IDs (e.g., multiple "1" entries)
    final_matches = []
    seen_section_ids = set()
    for match in unique_matches:
        section_key = f"{match['section_id'].upper()}_{match['section_title'][:20]}"
        if section_key not in seen_section_ids:
            final_matches.append(match)
            seen_section_ids.add(section_key)
    
    print(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        print(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")
    
    # Convert to DocumentSection objects
    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)
        
        section_content = content[start_pos:end_pos].strip()
        
        # Determine section type and metadata
        section_id = match['section_id'].upper()
        
        if re.match(r'^[IVX]+$', section_id):
            # This is a PART
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
            if match['section_title'] and match['section_title'] != f"Section {section_id}":
                title = f"Part {section_id} - {match['section_title']}"
        elif re.match(r'^\d+[A-C]?$', section_id):
            # This is an ITEM
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
            if match['section_title'] and match['section_title'] != f"Section {section_id}":
                title = f"Item {section_id} - {match['section_title']}"
        else:
            # Unknown section type
            section_type = 'content'
            part = None
            item_number = None
            title = match['section_title'] if match['section_title'] else match['full_line']
        
        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))
    
    return sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing
    """
    sections = []
    
    # Look for table of contents patterns
    toc_patterns = [
        r'(?i)INDEX.*?(?=\[PAGE BREAK\])',
        r'(?i)TABLE OF CONTENTS.*?(?=\[PAGE BREAK\])',
        r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\[PAGE BREAK\])',
        r'(?i)\[TABLE_START\].*?Page.*?\[TABLE_END\].*?(?=\[PAGE BREAK\])',
    ]
    
    toc_content = ""
    for pattern in toc_patterns:
        match = re.search(pattern, content, re.DOTALL)
        if match:
            toc_content = match.group(0)
            break
    
    if not toc_content:
        print("No table of contents found")
        return sections
    
    print(f"Found table of contents ({len(toc_content)} chars)")
    
    # Extract sections from TOC using multiple patterns
    item_patterns = [
        # Standard table format: Item 1. | Business | Page
        r'(?i)Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+?)\s*\|\s*\d+',
        r'(?i)PART\s+([IVX]+)\s*\|\s*([^|]+)',
        # Alternative formats
        r'(?i)Item\s+(\d{1,2}[A-C]?)\.\s+([^|\d]+)',
        r'(?i)(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+?)\s*\|\s*\d+',
    ]
    
    found_items = []
    for pattern in item_patterns:
        for match in re.finditer(pattern, toc_content):
            groups = match.groups()
            if len(groups) >= 2:
                item_id = groups[0].strip()
                item_title = groups[1].strip()
                # Clean up the title
                item_title = re.sub(r'\s+', ' ', item_title)
                found_items.append((item_id, item_title))
    
    # Remove duplicates
    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:20]}"
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)
    
    print(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        print(f"  • {item_id}: {title[:50]}...")
    
    return sections  # For now, just return the found items info

def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings
    """
    logger.info("Attempting universal SEC section detection")
    
    # Strategy 1: Direct pattern matching for table-formatted sections
    sections = detect_sections_universal_sec(content)
    
    if len(sections) >= 3:
        logger.info(f"Universal detection successful: Found {len(sections)} sections")
        return sections
    
    # Strategy 2: Table of contents analysis
    logger.warning("Direct detection found few sections, analyzing table of contents")
    detect_sections_from_toc_universal(content)  # For debugging info
    
    # Strategy 3: Page-based fallback
    logger.warning("Trying page-based detection as fallback")
    sections = detect_sections_strategy_2(content)
    
    if len(sections) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections)} sections")
        return sections
    
    # Final fallback
    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

# Universal processing function
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filename = Path(file_path).name
        file_id = filename.replace(".txt", "")
        parts = file_id.split('_')
        
        if len(parts) != 3:
            logger.error(f"Invalid filename format: {filename}")
            return []
        
        ticker, form_type, filing_date_str = parts
        
        # Create filing metadata
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
        
        if form_type == '10K' and filing_date.month < 4:
            fiscal_year -= 1
        
        filing_metadata = FilingMetadata(
            ticker=ticker,
            form_type=form_type,
            filing_date=filing_date_str,
            fiscal_year=fiscal_year,
            fiscal_quarter=fiscal_quarter,
            file_path=file_path
        )
        
        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        
        cleaned_content = clean_sec_text(raw_content)
        
        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")
        
        # Process each section
        all_chunks = []
        chunk_counter = 0
        
        for section in sections:
            # Extract tables from this section
            tables, narrative_content = extract_and_process_tables(section.content)
            
            # Create section info string
            section_info = create_section_info_improved(section, form_type)
            
            # Process tables
            for table in tables:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1
            
            # Process narrative content
            if narrative_content.strip():
                narrative_chunks = create_overlapping_chunks(
                    narrative_content, target_tokens, overlap_tokens
                )
                
                for chunk_data in narrative_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1
        
        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks
        
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

print("🚀 Universal SEC section detection ready!")
print("This should work for all your SEC filings with table-based formatting")
print("Replace process_filing_robust with process_filing_robust_universal")

🚀 Universal SEC section detection ready!
This should work for all your SEC filings with table-based formatting
Replace process_filing_robust with process_filing_robust_universal


In [19]:
# =============================================================================
# TEST UNIVERSAL SEC DETECTION
# =============================================================================

def test_universal_detection():
    """Test the universal detection on all your file types"""
    
    # Test different files to verify universal approach
    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt", 
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"  # If you have this one
    ]
    
    results = {}
    
    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found")
            continue
            
        print(f"\n🧪 Testing: {test_file}")
        print("=" * 80)
        
        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Test universal detection
        sections = detect_sections_robust_universal(content)
        
        print(f"\n✅ Found {len(sections)} sections:")
        for i, section in enumerate(sections[:10]):  # Show first 10
            print(f"  {i+1}. {section.title}")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars")
        
        # Test full pipeline
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}
        
        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }
        
        print(f"\n📊 Processing Results:")
        for key, value in stats.items():
            print(f"  {key}: {value}")
        
        if chunks:
            # Show section distribution
            section_counts = {}
            for chunk in chunks[:20]:  # Sample first 20
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1
            
            print(f"\n📚 Section Distribution (sample):")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks")
    
    # Summary comparison
    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY")
    print("="*80)
    
    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks")
    
    return results

def compare_old_vs_universal():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"
    
    if not os.path.exists(test_file):
        print("Test file not found for comparison")
        return
    
    print("⚖️ OLD vs UNIVERSAL Detection Comparison")
    print("="*60)
    
    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Old detection
    print("Running old detection...")
    old_sections = detect_sections_robust(content)
    
    # New universal detection  
    print("Running universal detection...")
    new_sections = detect_sections_robust_universal(content)
    
    print(f"\n📊 Comparison Results:")
    print(f"  Old detection: {len(old_sections)} sections")
    print(f"  Universal detection: {len(new_sections)} sections")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections")
    
    print(f"\n📋 Old Sections:")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}")
    
    print(f"\n📋 Universal Sections:")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}")
    
    return old_sections, new_sections

def quick_pattern_test():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"
    
    if not os.path.exists(test_file):
        print("Test file not found")
        return
    
    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()
    
    print("🔍 QUICK PATTERN TEST")
    print("="*50)
    
    # Test key patterns
    patterns = [
        (r'\[TABLE_START\].*?Item.*?\[TABLE_END\]', "Table-wrapped Items"),
        (r'Item\s+\d+[A-C]?\.\s*\|', "Pipe-separated Items"),
        (r'PART\s+[IVX]+', "Part headers"),
        (r'\[TABLE_START\].*?PART.*?\[TABLE_END\]', "Table-wrapped Parts"),
    ]
    
    for pattern, description in patterns:
        matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
        print(f"\n{description}: {len(matches)} matches")
        for i, match in enumerate(matches[:3]):
            # Clean up match for display
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...")

print("🚀 Ready to test universal SEC detection!")
print("\n1. Run test_universal_detection() to test all files")
print("2. Run compare_old_vs_universal() to see the improvement") 
print("3. Run quick_pattern_test() to see what patterns match")

🚀 Ready to test universal SEC detection!

1. Run test_universal_detection() to test all files
2. Run compare_old_vs_universal() to see the improvement
3. Run quick_pattern_test() to see what patterns match


In [22]:
# =============================================================================
# MISSING FUNCTION FIX - Add this to your notebook
# =============================================================================

def create_section_info_improved(section_title: str, section_type: str = "unknown") -> str:
    """
    Create standardized section info for chunks
    """
    if not section_title or section_title.strip() == "":
        return "Full Document"
    
    # Clean up the section title
    clean_title = section_title.strip()
    
    # Remove redundant prefixes
    clean_title = re.sub(r'^(Item/Part\s+)', '', clean_title)
    clean_title = re.sub(r'^(Part\s+[IVX]+\s+-\s+)', '', clean_title)
    
    # Standardize format
    if clean_title.startswith("Item "):
        return clean_title
    elif clean_title.startswith("Part "):
        return clean_title
    elif section_type == "item":
        return f"Item {clean_title}"
    elif section_type == "part":
        return f"Part {clean_title}"
    else:
        return clean_title

In [23]:
# Test the universal detection
results = test_universal_detection()

# Compare old vs new
old_sections, new_sections = compare_old_vs_universal()

# See what patterns actually match
quick_pattern_test()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Universal detection successful: Found 19 sections
INFO:__main__:Attempting universal SEC section detection



🧪 Testing: processed_filings/AAPL/AAPL_10K_2020-10-30.txt
🔍 Universal SEC detection found 19 unique sections:
  1: Item/Part I - Item 1.    Business...
  2: Item/Part 1A - Risk Factors...
  3: Item/Part 1B - Unresolved Staff Comments...
  4: Item/Part 3 - Legal Proceedings...
  5: Item/Part 4 - Mine Safety Disclosures...
  6: Item/Part II - Item 5.    Market for Registrant’s Common Equity, Related St...
  7: Item/Part 6 - Selected Financial Data...
  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
  10: Item/Part 8 - Financial Statements and Supplementary Data...
  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
  12: Item/Part 9A - Controls and Procedures...
  13: Item/Part 9B - Other Information...
  14: Item/Part 11 - Executive Compensation...
  15: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...

✅ Found

INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
ERROR:__main__:Error processing processed_filings/AAPL/AAPL_10K_2020-10-30.txt: 'DocumentSection' object has no attribute 'strip'
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Universal detection successful: Found 20 sections
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Found 1 sections in AMZN_10K_2023-02-03.txt
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10K_2023-02-03.txt: 'DocumentSection' object has no attribute 'strip'
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Universal detection successful: Found 9 sections



📊 Processing Results:
  error: No chunks created

🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt
🔍 Universal SEC detection found 20 unique sections:
  1: Item/Part I - [TABLE_START]...
  2: Item/Part 1A - Risk Factors...
  3: Item/Part 1B - Unresolved Staff Comments...
  4: Item/Part 2 - Properties...
  5: Item/Part 3 - Legal Proceedings...
  6: Item/Part 4 - Mine Safety Disclosures...
  7: Item/Part II - [TABLE_START]...
  8: Item/Part 6 - Reserved...
  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
  10: Item/Part 8 - Financial Statements and Supplementary Data...
  11: Item/Part 9 - Changes in and Disagreements with Accountants On Accounting ...
  12: Item/Part 9A - Controls and Procedures...
  13: Item/Part 9B - Other Information...
  14: Item/Part III - [TABLE_START]...
  15: Item/Part 11 - Executive Compensation...

✅ Found 20 sections:
  1. Part I - [TABLE_START]
     Type: part, Length: 13,293 chars
  2. Item 1A - Risk Factors
     T

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Found 1 sections in AMZN_10Q_2024-11-01.txt
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2024-11-01.txt: 'DocumentSection' object has no attribute 'strip'
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Universal detection successful: Found 7 sections
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Found 1 sections in KO_10Q_2020-07-22.txt
ERROR:__main__:Error processing processed_filings/KO/KO_10Q_2020-07-22.txt: 'DocumentSection' object has no attribute 'strip'
INFO:__main__:Attempting Strategy 1: Regex-based section detection
INFO:__main__:Strategy 1 successful: Found 25 sections



✅ Found 9 sections:
  1. Part I - . FINANCIAL INFORMATION
     Type: part, Length: 67,088 chars
  2. Item 2 - Management’s Discussion and Analysis of Financial Condition and Results of Operations
     Type: item, Length: 45,106 chars
  3. Item 3 - Quantitative and Qualitative Disclosures About Market Risk
     Type: item, Length: 4,404 chars
  4. Item 4 - Controls and Procedures
     Type: item, Length: 2,075 chars
  5. Part II - . OTHER INFORMATION
     Type: part, Length: 189 chars
  6. Item 1A - Risk Factors
     Type: item, Length: 59,432 chars
  7. Item 2 - Unregistered Sales of Equity Securities and Use of Proceeds
     Type: item, Length: 102 chars
  8. Item 3 - Defaults Upon Senior Securities
     Type: item, Length: 152 chars
  9. Item 5 - Other Information
     Type: item, Length: 5,327 chars
🔍 Universal SEC detection found 0 unique sections:
No table of contents found

📊 Processing Results:
  error: No chunks created

🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt
🔍 

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Universal detection successful: Found 19 sections


Running universal detection...
🔍 Universal SEC detection found 19 unique sections:
  1: Item/Part I - Item 1.    Business...
  2: Item/Part 1A - Risk Factors...
  3: Item/Part 1B - Unresolved Staff Comments...
  4: Item/Part 3 - Legal Proceedings...
  5: Item/Part 4 - Mine Safety Disclosures...
  6: Item/Part II - Item 5.    Market for Registrant’s Common Equity, Related St...
  7: Item/Part 6 - Selected Financial Data...
  8: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
  9: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
  10: Item/Part 8 - Financial Statements and Supplementary Data...
  11: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
  12: Item/Part 9A - Controls and Procedures...
  13: Item/Part 9B - Other Information...
  14: Item/Part 11 - Executive Compensation...
  15: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...

📊 Comparison Results:
  Old detecti

In [21]:
# =============================================================================
# MISSING FUNCTION FIX - Add this to your notebook
# =============================================================================

def create_section_info_improved(section_title: str, section_type: str = "unknown") -> str:
    """
    Create standardized section info for chunks
    """
    if not section_title or section_title.strip() == "":
        return "Full Document"
    
    # Clean up the section title
    clean_title = section_title.strip()
    
    # Remove redundant prefixes
    clean_title = re.sub(r'^(Item/Part\s+)', '', clean_title)
    clean_title = re.sub(r'^(Part\s+[IVX]+\s+-\s+)', '', clean_title)
    
    # Standardize format
    if clean_title.startswith("Item "):
        return clean_title
    elif clean_title.startswith("Part "):
        return clean_title
    elif section_type == "item":
        return f"Item {clean_title}"
    elif section_type == "part":
        return f"Part {clean_title}"
    else:
        return clean_title

def process_filing_robust_universal(file_path: str) -> List[ChunkWithMetadata]:
    """
    Updated processing function with the missing function included
    """
    try:
        logger.info(f"Processing {file_path}")
        
        # Read file
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Extract metadata
        metadata = extract_metadata_from_filename(file_path)
        
        # Detect sections using universal approach
        sections = detect_sections_robust_universal(content)
        logger.info(f"Found {len(sections)} sections in {os.path.basename(file_path)}")
        
        # Process each section
        all_chunks = []
        
        for section in sections:
            # Create chunks for this section
            section_chunks = create_chunks_from_section(
                section.content, 
                chunk_size=512, 
                overlap=0.2
            )
            
            # Add metadata to each chunk
            for chunk in section_chunks:
                chunk_with_meta = ChunkWithMetadata(
                    content=chunk.content,
                    metadata={
                        **metadata,
                        'section_info': create_section_info_improved(section.title, section.section_type),
                        'section_type': section.section_type,
                        'start_pos': chunk.start_pos,
                        'end_pos': chunk.end_pos,
                        'chunk_index': len(all_chunks),
                        'token_count': chunk.token_count,
                        'content_type': chunk.content_type
                    }
                )
                all_chunks.append(chunk_with_meta)
        
        return all_chunks
        
    except Exception as e:
        logger.error(f"Error processing {file_path}: {str(e)}")
        return []

# Test the fix
def test_fixed_processing():
    """Test that the missing function fix works"""
    
    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt"
    ]
    
    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file}")
            continue
            
        print(f"\n🧪 Testing FIXED processing: {test_file}")
        print("=" * 70)
        
        # Process with fixed function
        chunks = process_filing_robust_universal(test_file)
        
        if chunks:
            print(f"✅ SUCCESS: Created {len(chunks)} chunks!")
            
            # Show chunk distribution by section
            section_counts = {}
            for chunk in chunks:
                section = chunk.metadata.get('section_info', 'Unknown')
                section_counts[section] = section_counts.get(section, 0) + 1
            
            print(f"\n📚 Chunk Distribution by Section:")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks")
            
            # Show sample chunks
            print(f"\n📄 Sample Chunks:")
            for i, chunk in enumerate(chunks[:3]):
                section = chunk.metadata.get('section_info', 'Unknown')
                content_preview = ' '.join(chunk.content.split()[:15])
                print(f"  {i+1}. [{section}] {content_preview}...")
                
        else:
            print(f"❌ FAILED: No chunks created")
    
    return chunks if 'chunks' in locals() else []

print("🚀 Missing function fix added!")
print("\nRun test_fixed_processing() to verify the fix works!")

NameError: name 'ChunkWithMetadata' is not defined

In [28]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        r'(?im)^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$',
        r'(?im)^PART\s+([IVX]+)(?:\s*[-–—].*?)?$',

        # ITEM patterns - much more flexible
        r'(?im)^\s*ITEM\s+(\d{1,2}[A-C]?)(?:\.|\s|[-–—])',
        r'(?im)^ITEM\s+(\d{1,2}[A-C]?)(?:\.|\s|[-–—])',
        r'(?im)Item\s+(\d{1,2}[A-C]?)(?:\.|\s|[-–—])',

        # Number-dot format common in SEC filings
        r'(?im)^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}',

        # Content-based patterns for known sections
        r'(?im)^.{0,50}(BUSINESS)\s*$',
        r'(?im)^.{0,50}(RISK FACTORS)\s*$',
        r'(?im)^.{0,50}(LEGAL PROCEEDINGS)\s*$',
        r'(?im)^.{0,50}(FINANCIAL STATEMENTS)\s*$',
        r'(?im)^.{0,50}(MANAGEMENT.S DISCUSSION)\\s*',
        r'(?im)^.{0,50}(PROPERTIES)\s*$',
        r'(?im)^.{0,50}(CONTROLS AND PROCEDURES)\s*$',
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in re.finditer(pattern, content):
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # Good result
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information
    """
    if section.section_type == 'item' and section.item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(section.item_number, "Unknown Section")
            return f"Item {section.item_number} - {item_name}"
        elif form_type == '10Q':
            # Determine which part this item belongs to
            if section.item_number in ITEM_NAME_MAP_10Q_PART_I:
                item_name = ITEM_NAME_MAP_10Q_PART_I[section.item_number]
                return f"Part I, Item {section.item_number} - {item_name}"
            else:
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(section.item_number, "Unknown Section")
                return f"Part II, Item {section.item_number} - {item_name}"

    elif section.section_type == 'part' and section.part:
        return section.part

    else:
        return section.title or "Document Content"

def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting
    """
    sections = []

    if not content: # ADD THIS CHECK
        print("Empty content provided to detect_sections_universal_sec")
        return sections

    # Universal patterns for table-formatted SEC filings
    patterns = [
        # Table-based ITEM patterns (most common in your files)
        r'(?i)\[TABLE_START\]\s*Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^\[]+?)\s*\[TABLE_END\]',
        r'(?i)\[TABLE_START\]\s*Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)',

        # Table-based PART patterns
        r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]',
        r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\|\s*([^|]+)',
        r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\[TABLE_END\]',

        # Standalone ITEM patterns (fallback)
        r'(?i)^\s*Item\s+(\d{1,2}[A-C]?)\.\s*([^\n]+)',
        r'(?i)Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)',

        # Standalone PART patterns (fallback)
        r'(?i)^\s*PART\s+([IVX]+)\s*([^\n]*)',
        r'(?i)PART\s+([IVX]+)\s*\|\s*([^|]+)',

        # Number-only patterns in tables
        r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)',

        # Headers that might appear standalone
        r'(?i)^(Item\\s+\\d{1,2}[A-C]?\\.\\s+[^|]+?)$',
        r'(?i)^(PART\\s+[IVX]+)(?:\\s*[-–—]\\s*(.+))?$',
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in re.finditer(pattern, content, re.MULTILINE):
            # Get context around the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Skip if this looks like metadata or page headers
            if any(skip_word in full_line.lower() for skip_word in
                   ['page', 'signature', 'exhibit', 'index', 'table of contents']):
                continue

            # Skip very long lines that are probably not headers
            if len(full_line) > 500:
                continue

            # Extract section information
            groups = match.groups()

            if len(groups) >= 2 and groups[1]:
                section_id = groups[0].strip()
                section_title = groups[1].strip()
                # Clean up section title
                section_title = re.sub(r'\[TABLE_END\].*', '', section_title).strip()
                section_title = section_title.replace('|', '').strip()
            elif len(groups) >= 1:
                section_id = groups[0].strip()
                section_title = f"Section {section_id}" # Default title if no specific title captured
            else:
                section_id = 'unknown'
                section_title = full_line # Fallback to full line as title

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id,
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 100 characters
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 100
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Remove very similar section IDs (e.g., multiple "1" entries)
    final_matches = []
    seen_section_ids = set()
    for match in unique_matches:
        section_key = f"{match['section_id'].upper()}_{match['section_title'][:20]}"
        if section_key not in seen_section_ids:
            final_matches.append(match)
            seen_section_ids.add(section_key)

    print(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        print(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        section_id = match['section_id'].upper()

        if re.match(r'^[IVX]+$', section_id):
            # This is a PART
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
            if match['section_title'] and match['section_title'] != f"Section {section_id}":
                title = f"Part {section_id} - {match['section_title']}"
        elif re.match(r'^\d+[A-C]?$', section_id):
            # This is an ITEM
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
            if match['section_title'] and match['section_title'] != f"Section {section_id}":
                title = f"Item {section_id} - {match['section_title']}"
        else:
            # Unknown section type
            section_type = 'content'
            part = None
            item_number = None
            title = match['section_title'] if match['section_title'] else match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing
    """
    sections = []

    if not content: # ADD THIS CHECK
        print("Empty content provided to detect_sections_from_toc_universal")
        return sections

    # Look for table of contents patterns
    toc_patterns = [
        r'(?i)INDEX.*?(?=\\[PAGE BREAK\\])',
        r'(?i)TABLE OF CONTENTS.*?(?=\\[PAGE BREAK\\])',
        r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\\[PAGE BREAK\\])', # Including INDEX as it's common
        r'(?i)\[TABLE_START\].*?Page.*?\\[TABLE_END\].*?(?=\\[PAGE BREAK\\])',
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = re.search(pattern, content, re.DOTALL)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        print("No table of contents found")
        return sections # Return empty list if no TOC found

    print(f"Found table of contents ({len(toc_content)} chars)")

    # ... (rest of the function remains the same)
    item_patterns = [
        # Standard table format: Item 1. | Business | Page
        r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+',
        r'(?i)PART\\s+([IVX]+)\\s*\\|\\s*([^|]+)',
        # Alternative formats
        r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s+([^|\\d]+)',
        r'(?i)(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+',
    ]

    found_items = []
    # This loop also needs to handle empty toc_content if previous steps fail
    if not toc_content: # Defensive check
        return sections

    for pattern in item_patterns:
        for match in re.finditer(pattern, toc_content): # Use toc_content here
            groups = match.groups()
            if len(groups) >= 2:
                item_id = groups[0].strip()
                item_title = groups[1].strip()
                item_title = re.sub(r'\\s+', ' ', item_title)
                found_items.append((item_id, item_title))

    # ... (rest of the function for unique_items and DocumentSection creation)
    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:20]}"
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    print(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        print(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    for item_id, title in unique_items:
        section_type = 'item' if re.match(r'^\d+[A-C]?$', item_id) else ('part' if re.match(r'^[IVX]+$', item_id) else 'unknown')
        item_number = item_id if section_type == 'item' else None
        part_num = item_id if section_type == 'part' else None
        toc_sections.append(DocumentSection(
            title=title,
            content="",
            section_type=section_type,
            item_number=item_number,
            part=part_num
        ))
    return toc_sections

    # Remove duplicates
    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:20]}"
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    print(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        print(f"  • {item_id}: {title[:50]}...")

    # Convert extracted TOC items into DocumentSection objects
    toc_sections = []
    for item_id, title in unique_items:
        section_type = 'item' if re.match(r'^\d+[A-C]?$', item_id) else ('part' if re.match(r'^[IVX]+$', item_id) else 'unknown')
        item_number = item_id if section_type == 'item' else None
        part_num = item_id if section_type == 'part' else None
        toc_sections.append(DocumentSection(
            title=title,
            content="", # Content is empty, as it's just from TOC. Needs actual content extraction later.
            section_type=section_type,
            item_number=item_number,
            part=part_num
        ))
    return toc_sections

def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching, then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (more robust than previous strategy_1_improved on its own)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3: # A reasonable number of sections to consider it successful
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents. If successful, use these as sections.
    # Note: This strategy only extracts titles/item_numbers, not content.
    # If this strategy is chosen, you would need a mechanism to map these titles back
    # to the main document to extract actual content.
    logger.warning("Direct detection found few sections, analyzing table of contents")
    sections_toc = detect_sections_from_toc_universal(content)

    if sections_toc and len(sections_toc) >= 3:
        logger.info(f"TOC analysis found {len(sections_toc)} potential sections. Will attempt to map content to them.")
        # This is where a sophisticated mapping from TOC titles to content blocks would go.
        # For simplicity in this structure, if TOC looks good, we try to use it to
        # derive actual content sections. A simple approach is to use regex matching
        # from the TOC titles to split the original document.

        # For this example, let's try a simple re-split based on TOC titles if available
        # This can be complex and might need further refinement based on document structure.
        # A quick way to leverage TOC is to combine with simple regex/page split
        # or use it to re-label existing sections.
        
        # Given the previous output, `detect_sections_universal_sec` seems to be the primary workhorse,
        # and TOC as a fallback/enhancement. Let's stick to the flow in the prompt where
        # if strategy 1 is not good, it tries strategy 2 (page-based), and if that's not good,
        # it just creates one big chunk.
        # The prompt's logging showed "No table of contents found" and then "All strategies failed".
        # This implies sections_toc is often empty or not used for content splitting.
        pass # The logic below will handle fallbacks if TOC is not used for primary sectioning.

    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections")
        return sections_strategy2

    # Final fallback
    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        # Fallback for malformed filenames, though process_filing_robust_universal checks this
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    if form_type == '10K' and filing_date.month < 4:
        fiscal_year -= 1

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# 7. MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        # Moved metadata extraction here as it's robust and used early
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation


        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            # This is the original, more robust create_section_info which expects DocumentSection
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []


# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!")
print("="*60)
print("Key improvements over original approach:")
print("✅ Multi-strategy section detection with fallbacks")
print("✅ Sentence-aware chunking with overlap")
print("✅ Robust error handling and logging")
print("✅ Structured data classes for better organization")
print("✅ Quality validation and statistics")
print("✅ Separate table and narrative processing")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}")
        print("="*50)

        # Changed to universal processing function
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:")
        for key, value in stats.items():
            print(f"  {key}: {value}")

        print("\n📝 Sample Chunks:")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):")
            print(f"  Section: {chunk.section_info}")
            print(f"  Tokens: {chunk.token_count}")
            print(f"  Text preview: {chunk.text[:200]}...")

        return chunks
    else:
        print(f"❌ File not found: {test_file}")
        print("Please update the file path to match your data structure")
        return []

# Run the test
chunks = test_single_file()

def compare_section_strategies(content: str): # Changed content_sample to content to use full content
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies")
    print("="*50)

    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content) # Changed content_sample to content
    print(f"Strategy 1 (Regex): {len(sections_1)} sections")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")

    print()

    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content) # Changed content_sample to content
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")

    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        # Load full content for comparison, not just a sample
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison) # Clean it for consistent comparison

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze")
        return

    print("📊 Chunking Quality Analysis")
    print("="*50)

    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}")
    print(f"  Min: {min(token_counts)}")
    print(f"  Max: {max(token_counts)}")

    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1

    print(f"\nChunk Types:")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}")

    # Section distribution
    sections_dist = {} # Renamed to avoid conflict with `sections` list
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1

    print(f"\nSection Distribution:")
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks")

    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters")
    print("="*50)

    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}")
        # Changed to universal processing function
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}")

    return results

# Test different parameters
param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling")
    print("="*50)

    # Test 1: Non-existent file
    print("Test 1: Non-existent file")
    # Changed to universal processing function
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)")

    # Test 2: Empty file
    print("\nTest 2: Empty content")
    empty_sections = detect_sections_robust_universal("") # Changed to universal detection
    print(f"  Result: {len(empty_sections)} sections")

    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    # Changed to universal processing function
    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)")

    # Clean up
    os.unlink(temp_file)

    # Test 4: Very short text
    print("\nTest 4: Very short text")
    # This call is correct, as create_overlapping_chunks is a helper
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}")
        return []

    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}")

        # Changed to universal processing function
        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    # Summary statistics
    print(f"\n📊 Batch Processing Summary:")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}")

    print(f"\n📋 Per-file results:")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables")

    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary")
    print("="*60)

    # Assumes 'chunks' variable from test_single_file() is available
    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first")
        return

    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})")
    print(f"  • Total chunks: {len(df)}")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%")

    print(f"\n📊 Chunk Distribution by Type:")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)")

    print(f"\n📚 Section Breakdown:")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks")

    # Quality metrics
    print(f"\n✅ Quality Metrics:")

    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)")

    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)")

    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}")

    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:")

    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        # Find the actual chunk object to get its full text
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):")
        print(f"    Section: {sample['section']}")
        print(f"    Preview: {chunk_obj.text[:150]}...")

    return df

# Create final summary
summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:")
    for improvement in improvements:
        print(f"  {improvement}")

    print(f"\n⚖️ Potential Tradeoffs:")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}")

    print(f"\n🎯 Recommended Next Steps:")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files")
    print("Then: Implement the embedding pipeline and MCP server!")
    print("="*60)

compare_with_original()

# Test the universal detection (adapted from your existing test_universal_detection)
# This part of the code was duplicated/re-defined in the ipynb.
# I'm placing the call here as it was in your original structure.
print("🚀 Ready to test universal SEC detection!")
print("\n1. Run test_universal_detection_fixed() to test all files")
print("2. Run compare_old_vs_universal_fixed() to see the improvement")
print("3. Run quick_pattern_test_fixed() to see what patterns match")

results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing processed_filings/AAPL/AAPL_10K_2020-10-30.txt: unbalanced parenthesis at position 65


🚀 SEC Filing Preprocessing Strategy - Ready for Testing!
Key improvements over original approach:
✅ Multi-strategy section detection with fallbacks
✅ Sentence-aware chunking with overlap
✅ Robust error handling and logging
✅ Structured data classes for better organization
✅ Quality validation and statistics
✅ Separate table and narrative processing
🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt
🔍 Universal SEC detection found 0 unique sections:


ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmparuw5yik_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2022-04-29.txt: unbalanced parenthesis at position 65
INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2020-05-01.txt: unbalanced parenthesis at position 65
INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2020-10-30.txt: unbalanced parenthesis at position 65


📊 Processing Results:
  error: No chunks created

📝 Sample Chunks:
No test file processed yet
🛡️ Testing Error Handling
Test 1: Non-existent file
  Result: 0 chunks (expected 0)

Test 2: Empty content
Empty content provided to detect_sections_universal_sec
Empty content provided to detect_sections_from_toc_universal
  Result: 1 sections

Test 3: Malformed filename
  Result: 0 chunks (expected 0)

Test 4: Very short text
  Result: 0 chunks
🔄 Testing Batch Processing (max 3 files)
Processing 3 files...
  1/3: AMZN_10Q_2022-04-29.txt
🔍 Universal SEC detection found 0 unique sections:
  2/3: AMZN_10Q_2020-05-01.txt
🔍 Universal SEC detection found 0 unique sections:
  3/3: AMZN_10Q_2020-10-30.txt
🔍 Universal SEC detection found 0 unique sections:

📊 Batch Processing Summary:
  Total files processed: 3

  Total chunks created: 0
  Average chunks per file: 0.0

📋 Per-file results:
  AMZN_10Q_2022-04-29.txt: 0 chunks, 0 sections, 0 tables
  AMZN_10Q_2020-05-01.txt: 0 chunks, 0 sections, 0 tabl

NameError: name 'test_universal_detection_fixed' is not defined

In [29]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start,
                'end_pos': line_end,
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # A reasonable number of sections to consider it successful
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information
    """
    if section.section_type == 'item' and section.item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(section.item_number, "Unknown Section")
            return f"Item {section.item_number} - {item_name}"
        elif form_type == '10Q':
            # Determine which part this item belongs to
            if section.item_number in ITEM_NAME_MAP_10Q_PART_I:
                item_name = ITEM_NAME_MAP_10Q_PART_I[section.item_number]
                return f"Part I, Item {section.item_number} - {item_name}"
            else:
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(section.item_number, "Unknown Section")
                return f"Part II, Item {section.item_number} - {item_name}"

    elif section.section_type == 'part' and section.part:
        return section.part

    else:
        return section.title or "Document Content"

def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting
    """
    sections = []

    if not content: # Added check for empty content
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    patterns = [
        # Table-based ITEM patterns (most common in your files)
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.DOTALL),

        # Table-based PART patterns
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\|\s*([^\[]+?)\s*\[TABLE_END\]', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[TABLE_START\]\s*PART\s+([IVX]+)\s*\[TABLE_END\]', re.DOTALL),

        # Standalone ITEM patterns (fallback)
        re.compile(r'(?i)^\s*Item\s+(\d{1,2}[A-C]?)\.\s*([^\n]+)', re.M),
        re.compile(r'(?i)Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.DOTALL),

        # Standalone PART patterns (fallback)
        re.compile(r'(?i)^\s*PART\s+([IVX]+)\s*([^\n]*)', re.M),
        re.compile(r'(?i)PART\s+([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),

        # Number-only patterns in tables
        re.compile(r'(?i)\[TABLE_START\]\s*(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.DOTALL),

        # Headers that might appear standalone
        re.compile(r'(?i)^(Item\\s+\\d{1,2}[A-C]?\\.\\s+[^|]+?)$', re.M),
        re.compile(r'^(PART\\s+[IVX]+)(?:\\s*[-–—]\\s*(.+))?$', re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get context around the match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section information
            groups = match.groups()

            if len(groups) >= 2 and groups[1]:
                section_id = groups[0].strip()
                section_title = groups[1].strip()
                # Clean up section title
                section_title = re.sub(r'\[TABLE_END\].*', '', section_title).strip()
                section_title = section_title.replace('|', '').strip()
            elif len(groups) >= 1:
                section_id = groups[0].strip()
                section_title = f"Section {section_id}" # Default title if no specific title captured
            else:
                section_id = 'unknown'
                section_title = full_line # Fallback to full line as title

            all_matches.append({
                'start_pos': match.start(), # Use match.start() for more precise position of regex match
                'end_pos': match.end(),     # Use match.end()
                'full_line': full_line,
                'section_id': section_id,
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches by their start position
    all_matches.sort(key=lambda x: x['start_pos'])

    # Remove duplicates - matches within 100 characters (adjusted logic for overlap)
    unique_matches = []
    if all_matches:
        unique_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            # If current match is far enough from previous unique match, add it
            if all_matches[i]['start_pos'] - unique_matches[-1]['start_pos'] > 100:
                unique_matches.append(all_matches[i])
            # Else, if the new match is a 'better' match (e.g., lower pattern_idx indicates higher priority pattern)
            # This logic can be more complex, but for now a simple distance check is good.
            # Or if it's a "cleaner" item/part header over a generic one, replace
            elif all_matches[i]['section_id'] != 'unknown' and unique_matches[-1]['section_id'] == 'unknown':
                 unique_matches[-1] = all_matches[i]

    # Further refine unique matches by avoiding similar section IDs too close if not from primary patterns
    final_matches = []
    seen_primary_sections = {} # For "Item X" and "Part Y" that are typically unique
    for match_data in unique_matches:
        section_id_upper = match_data['section_id'].upper()
        if re.match(r'^(ITEM|PART)\s', match_data['full_line'].upper()): # If it's a strong ITEM/PART header
            if section_id_upper not in seen_primary_sections:
                final_matches.append(match_data)
                seen_primary_sections[section_id_upper] = match_data['start_pos']
            else:
                # If a primary section ID is repeated, pick the earlier one or a "better" one
                # For simplicity, we just keep the first one found unless a significant distance
                if match_data['start_pos'] - seen_primary_sections[section_id_upper] > 500: # New occurrence of the same item far away
                     final_matches.append(match_data)
                     seen_primary_sections[section_id_upper] = match_data['start_pos']
        else: # For other types of matches, add if not too close to the last added match
            if not final_matches or (match_data['start_pos'] - final_matches[-1]['start_pos'] > 100):
                final_matches.append(match_data)

    print(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        print(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    # This loop requires calculating end_pos based on the *next* detected section's start_pos
    final_document_sections = []
    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # End position is the start of the next matched section, or end of content if it's the last one
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        section_id = match['section_id'].upper()
        title = match['section_title'] # Use the extracted section title

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            if title == f"Section {section_id}": # If it's just a generic "Section X" title
                title = part # Use the standardized PART title
            elif not title: # If no specific title was captured
                 title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            if title == f"Section {section_id}": # If it's just a generic "Section X" title
                title = f"Item {item_number}" # Use the standardized ITEM title
            elif not title: # If no specific title was captured
                 title = f"Item {item_number}"

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content: # Added check for empty content
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\\[PAGE BREAK\\])', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\\[PAGE BREAK\\])', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\\[PAGE BREAK\\])', re.DOTALL), # Common variant
        re.compile(r'(?i)\[TABLE_START\].*?Page.*?\\[TABLE_END\].*?(?=\\[PAGE BREAK\\])', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content) # Use pre-compiled pattern
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections # Return empty list if no TOC found

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    item_patterns = [
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL), # Standard table format
        re.compile(r'(?i)PART\\s+([IVX]+)\\s*\\|\\s*([^|]+)', re.DOTALL), # Part in table
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s+([^|\\d]+)', re.M), # Standalone Item line
        re.compile(r'(?i)(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL), # Number-dot item in table
        re.compile(r'(?i)PART\\s+([IVX]+)', re.M) # Simple PART line
    ]

    found_items = []
    for pattern in item_patterns:
        for match in pattern.finditer(toc_content): # Use pre-compiled pattern and toc_content
            groups = match.groups()
            if len(groups) >= 2: # Pattern captured both ID and Title
                item_id = groups[0].strip()
                item_title = groups[1].strip()
                item_title = re.sub(r'\\s+', ' ', item_title) # Normalize whitespace
                found_items.append((item_id, item_title))
            elif len(groups) == 1: # Pattern only captured ID (e.g., simple PART)
                item_id = groups[0].strip()
                # Attempt to get text immediately following the item_id if available, otherwise use a generic title
                # This makes the TOC parsing more robust for less structured TOCs.
                remaining_text = toc_content[match.end():].split('\n')[0].strip()
                if remaining_text:
                    item_title = remaining_text
                else:
                    item_title = f"Section {item_id}"
                item_title = re.sub(r'\\s+', ' ', item_title)
                found_items.append((item_id, item_title))


    # Remove duplicates
    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:50]}" # Use a longer slice for better uniqueness
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = item_id

        toc_sections.append(DocumentSection(
            title=title,
            content="", # Content is intentionally empty here; will be filled by main sectioning if this strategy is chosen.
            section_type=section_type,
            item_number=item_number,
            part=part_num
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3: # A reasonable number of sections to consider it successful
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents. If successful and provides many sections,
    # it indicates a structured document. We'll use these titles and re-scan the document
    # for their content.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")
        # This is a critical step: we need to use the TOC titles to find the actual content.
        # A robust way is to dynamically build regex patterns from the TOC titles
        # and search for them in the main content.

        combined_sections = []
        last_end_pos = 0

        # Create regex patterns from TOC titles, prioritizing item/part numbers if available
        # Example: "Item 1A. Risk Factors" or "PART I. FINANCIAL INFORMATION"
        patterns_from_toc = []
        for entry in toc_entries:
            if entry.item_number:
                patterns_from_toc.append(re.escape(f"Item {entry.item_number}").replace('\\ ', '\\s*'))
            elif entry.part:
                patterns_from_toc.append(re.escape(f"PART {entry.part}").replace('\\ ', '\\s*'))
            else:
                patterns_from_toc.append(re.escape(entry.title).replace('\\ ', '\\s*')) # Escape for regex use

        # Combine patterns with flexible whitespace and make sure they match start of line
        full_pattern = '|'.join(f'(?:^\\s*{p})' for p in patterns_from_toc if p)
        if full_pattern:
            compiled_full_pattern = re.compile(full_pattern, re.I | re.M)
            
            matches = list(compiled_full_pattern.finditer(content))
            
            for i, match in enumerate(matches):
                start_pos = match.start()
                end_pos = matches[i+1].start() if i+1 < len(matches) else len(content)
                
                section_content = content[start_pos:end_pos].strip()
                
                # Try to map this content block back to a TOC entry
                # This is heuristic and might need refinement
                matched_toc_entry = None
                for entry in toc_entries:
                    if entry.item_number and f"Item {entry.item_number}".upper() in match.group(0).upper():
                        matched_toc_entry = entry
                        break
                    elif entry.part and f"PART {entry.part}".upper() in match.group(0).upper():
                        matched_toc_entry = entry
                        break
                    elif entry.title.upper() in match.group(0).upper():
                        matched_toc_entry = entry
                        break
                
                if matched_toc_entry:
                    combined_sections.append(DocumentSection(
                        title=matched_toc_entry.title,
                        content=section_content,
                        section_type=matched_toc_entry.section_type,
                        item_number=matched_toc_entry.item_number,
                        part=matched_toc_entry.part,
                        start_pos=start_pos,
                        end_pos=end_pos
                    ))
                else: # Fallback if TOC entry not clearly matched
                    title = content[start_pos:content.find('\n', start_pos)].strip() or "Unknown Section"
                    combined_sections.append(DocumentSection(
                        title=title,
                        content=section_content,
                        section_type='content',
                        start_pos=start_pos,
                        end_pos=end_pos
                    ))
            
            if len(combined_sections) >= 3:
                logger.info(f"Universal detection successful (TOC-based): Found {len(combined_sections)} sections.")
                return combined_sections
        else:
            logger.warning("No valid patterns generated from TOC for content mapping.")

    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        # Fallback for malformed filenames, though process_filing_robust_universal checks this
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    # Adjust fiscal year for 10-K filings if the filing date is early in the calendar year
    # and typically refers to the previous fiscal year end.
    if form_type == '10K' and filing_date.month <= 3: # Assuming fiscal year ends typically in Dec or Jan-Mar for previous year
        fiscal_year -= 1 # Often a 10K filed in Jan-Mar of current year is for previous fiscal year

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Extract tables and narrative from this section's content
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []


# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0 # Renamed variable to avoid conflict with function parameter 'overlap_tokens'

            # Add sentences from the end until we reach overlap target
            # Ensure we don't go past the start of the chunk
            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            # If after trying to create overlap, we still don't have enough tokens for overlap
            # (e.g., first few sentences are very long), just take some minimal content.
            if not overlap_sentences and current_chunk_sentences:
                # Fallback to last sentence if no other overlap possible and current chunk exists
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            # Start new chunk with overlap + current sentence
            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            # Add sentence to current chunk
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    # Add final chunk if it has content
    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        # Clean table markers
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:  # Only add non-empty tables
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!")
print("="*60)
print("Key improvements over original approach:")
print("✅ Multi-strategy section detection with fallbacks")
print("✅ Sentence-aware chunking with overlap")
print("✅ Robust error handling and logging")
print("✅ Structured data classes for better organization")
print("✅ Quality validation and statistics")
print("✅ Separate table and narrative processing")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}")
        print("="*50)

        # Changed to universal processing function
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:")
        for key, value in stats.items():
            print(f"  {key}: {value}")

        print("\n📝 Sample Chunks:")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):")
            print(f"  Section: {chunk.section_info}")
            print(f"  Tokens: {chunk.token_count}")
            print(f"  Text preview: {chunk.text[:200]}...")

        return chunks
    else:
        print(f"❌ File not found: {test_file}")
        print("Please update the file path to match your data structure")
        return []

# Run the test
chunks = test_single_file()

def compare_section_strategies(content: str): # Changed content_sample to content to use full content
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies")
    print("="*50)

    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content) # Changed content_sample to content
    print(f"Strategy 1 (Regex): {len(sections_1)} sections")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")

    print()

    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content) # Changed content_sample to content
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")

    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        # Load full content for comparison, not just a sample
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison) # Clean it for consistent comparison

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze")
        return

    print("📊 Chunking Quality Analysis")
    print("="*50)

    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}")
    print(f"  Min: {min(token_counts)}")
    print(f"  Max: {max(token_counts)}")

    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1

    print(f"\nChunk Types:")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}")

    # Section distribution
    sections_dist = {} # Renamed to avoid conflict with `sections` list
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1

    print(f"\nSection Distribution:")
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks")

    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters")
    print("="*50)

    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}")
        # Changed to universal processing function
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}")

    return results

# Test different parameters
param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling")
    print("="*50)

    # Test 1: Non-existent file
    print("Test 1: Non-existent file")
    # Changed to universal processing function
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)")

    # Test 2: Empty file
    print("\nTest 2: Empty content")
    empty_sections = detect_sections_robust_universal("") # Changed to universal detection
    print(f"  Result: {len(empty_sections)} sections")

    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    # Changed to universal processing function
    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)")

    # Clean up
    os.unlink(temp_file)

    # Test 4: Very short text
    print("\nTest 4: Very short text")
    # This call is correct, as create_overlapping_chunks is a helper
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}")
        return []

    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}")

        # Changed to universal processing function
        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    # Summary statistics
    print(f"\n📊 Batch Processing Summary:")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}")

    print(f"\n📋 Per-file results:")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables")

    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary")
    print("="*60)

    # Assumes 'chunks' variable from test_single_file() is available
    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first")
        return

    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})")
    print(f"  • Total chunks: {len(df)}")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%")

    print(f"\n📊 Chunk Distribution by Type:")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)")

    print(f"\n📚 Section Breakdown:")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks")

    # Quality metrics
    print(f"\n✅ Quality Metrics:")

    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)")

    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)")

    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}")

    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:")

    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        # Find the actual chunk object to get its full text
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

# Create final summary
summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:")
    for improvement in improvements:
        print(f"  {improvement}")

    print(f"\n⚖️ Potential Tradeoffs:")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}")

    print(f"\n🎯 Recommended Next Steps:")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files")
    print("Then: Implement the embedding pipeline and MCP server!")
    print("="*60)

compare_with_original()

# Test functions adapted to _fixed suffix to avoid NameErrors from notebook re-runs
# Ensure these are called after all function definitions.
print("🚀 Ready to test universal SEC detection!")
print("\n1. Run test_universal_detection_fixed() to test all files")
print("2. Run compare_old_vs_universal_fixed() to see the improvement")
print("3. Run quick_pattern_test_fixed() to see what patterns match")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"  # If you have this one
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found")
            continue

        print(f"\n🧪 Testing: {test_file}")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Test universal detection
        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:")
        for i, section in enumerate(sections[:10]):  # Show first 10
            print(f"  {i+1}. {section.title}")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars")

        # Test full pipeline
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:")
        for key, value in stats.items():
            print(f"  {key}: {value}")

        if chunks:
            # Show section distribution
            section_counts = {}
            for chunk in chunks[:20]:  # Sample first 20
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    # Summary comparison
    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Old detection
    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content) # Use detect_sections_robust_old

    # New universal detection
    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    # Test key patterns
    patterns = [
        (re.compile(r'\[TABLE_START\].*?Item.*?\\[TABLE_END\\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\\s+\\d+[A-C]?\\.\\s*\\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\].*?PART.*?\\[TABLE_END\\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content) # Use compiled pattern
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            # Clean up match for display
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing processed_filings/AAPL/AAPL_10K_2020-10-30.txt: unbalanced parenthesis at position 65
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_universal_sec. Returning empty sections.
INFO:__main__:Empty content provided to detect_sections_from_toc_universal. Returning empty sections.
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmp2i0ej49u_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2022-04-29.txt: unbalanced parenthesis at position 65
INFO:__main__:Attempting universal SEC section detection
ERROR:__main__:Error processing proce

🚀 SEC Filing Preprocessing Strategy - Ready for Testing!
Key improvements over original approach:
✅ Multi-strategy section detection with fallbacks
✅ Sentence-aware chunking with overlap
✅ Robust error handling and logging
✅ Structured data classes for better organization
✅ Quality validation and statistics
✅ Separate table and narrative processing
🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt
🔍 Universal SEC detection found 0 unique sections:
📊 Processing Results:
  error: No chunks created

📝 Sample Chunks:
No test file processed yet
🛡️ Testing Error Handling
Test 1: Non-existent file
  Result: 0 chunks (expected 0)

Test 2: Empty content
  Result: 1 sections

Test 3: Malformed filename
  Result: 0 chunks (expected 0)

Test 4: Very short text
  Result: 0 chunks
🔄 Testing Batch Processing (max 3 files)
Processing 3 files...
  1/3: AMZN_10Q_2022-04-29.txt
🔍 Universal SEC detection found 0 unique sections:
  2/3: AMZN_10Q_2020-05-01.txt
🔍 Universal SEC detection found 0

error: unbalanced parenthesis at position 65

In [31]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start, # Changed from match.start() for consistency with line-based detection
                'end_pos': line_end,     # Changed from match.end()
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # A reasonable number of sections to consider it successful
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information
    """
    if section.section_type == 'item' and section.item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(section.item_number, "Unknown Section")
            return f"Item {section.item_number} - {item_name}"
        elif form_type == '10Q':
            # Determine which part this item belongs to
            if section.item_number in ITEM_NAME_MAP_10Q_PART_I:
                item_name = ITEM_NAME_MAP_10Q_PART_I[section.item_number]
                return f"Part I, Item {section.item_number} - {item_name}"
            else:
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(section.item_number, "Unknown Section")
                return f"Part II, Item {section.item_number} - {item_name}"

    elif section.section_type == 'part' and section.part:
        return section.part

    else:
        return section.title or "Document Content"

def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting
    """
    sections = []

    if not content: # Added check for empty content
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using re.escape for literal brackets and compiling patterns once.
    patterns = [
        re.compile(r'(?i)\[\[TABLE_START\]\].*?Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^\[]+?)\s*\[\[TABLE_END\]\]', re.DOTALL),
        re.compile(r'(?i)\[\[TABLE_START\]\].*?Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.DOTALL),

        re.compile(r'(?i)\[\[TABLE_START\]\].*?PART\s+([IVX]+)\s*\|\s*([^\[]+?)\s*\[\[TABLE_END\]\]', re.DOTALL),
        re.compile(r'(?i)\[\[TABLE_START\]\].*?PART\s+([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)\[\[TABLE_START\]\].*?PART\s+([IVX]+)\s*\[\[TABLE_END\]\]', re.DOTALL),

        re.compile(r'(?i)^\s*Item\s+(\d{1,2}[A-C]?)\.\s*([^\n]+)', re.M),
        re.compile(r'(?i)Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.DOTALL),

        re.compile(r'(?i)^\s*PART\s+([IVX]+)\s*([^\n]*)', re.M),
        re.compile(r'(?i)PART\s+([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),

        re.compile(r'(?i)\[\[TABLE_START\]\].*?(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.DOTALL),

        re.compile(r'(?i)^(Item\\s+\\d{1,2}[A-C]?\\.\\s+[^|]+?)$', re.M), # Standardize this escaping
        re.compile(r'^(PART\\s+[IVX]+)(?:\\s*[-–—]\\s*(.+))?$', re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get context around the match (full line)
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section information
            groups = match.groups()

            section_id = groups[0].strip() if groups else 'unknown'
            section_title = ""

            if len(groups) >= 2 and groups[1]:
                section_title = groups[1].strip()
                # Clean up section title from table markers
                section_title = re.sub(r'\[\[TABLE_END\]\].*', '', section_title).strip() # Using [[...]]
                section_title = section_title.replace('|', '').strip()
            elif len(groups) == 1:
                # For patterns that only capture an ID, try to get the rest of the line as title
                line_after_id_match = content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    section_title = line_after_id_match
                else:
                    section_title = f"Section {section_id}" # Generic fallback title
            else:
                section_title = full_line # Fallback to full line as title

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id,
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches by their start position
    all_matches.sort(key=lambda x: x['start_pos'])

    # Remove duplicates and prioritize 'better' matches
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0]) # Add the first match
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # Heuristic for deciding whether to add or replace:
            # 1. If current match is significantly after the last added match, add it.
            if current_match['start_pos'] - last_added_match['start_pos'] > 150: # Increased distance for new section
                final_matches.append(current_match)
            # 2. If current match is very close, but provides a more specific 'item' or 'part' ID
            #    and the last added one was generic 'unknown' or less specific.
            elif current_match['start_pos'] - last_added_match['start_pos'] < 50: # Close proximity
                if last_added_match['section_id'] == 'unknown' and current_match['section_id'] != 'unknown':
                    final_matches[-1] = current_match # Replace with more specific match
                elif last_added_match['section_id'] == current_match['section_id'] and last_added_match['pattern_idx'] > current_match['pattern_idx']:
                    # If same ID but new pattern has higher priority (lower index means earlier in list)
                    final_matches[-1] = current_match

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # End position is the start of the next matched section, or end of content if it's the last one
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        section_id = match['section_id'].upper()
        title = match['section_title'] # Use the extracted section title

        section_type = 'content' # Default type
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id: # If it's just a generic "Part X" title
                title = part # Use the standardized PART title
            elif not title: # If no specific title was captured
                 title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id: # If it's just a generic "Item X" title
                title = f"Item {item_number}" # Use the standardized ITEM title
            elif not title: # If no specific title was captured
                 title = f"Item {item_number}"

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content: # Added check for empty content
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns
    # Using re.escape for literal brackets and compiling patterns once.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\\[PAGE BREAK\\])', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\\[PAGE BREAK\\])', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\\[PAGE BREAK\\])', re.DOTALL), # Common variant
        re.compile(r'(?i)\[\[TABLE_START\]\].*?Page.*?\\[\[TABLE_END\]\].*?(?=\\[PAGE BREAK\\])', re.DOTALL), # Hyper-defensive escaping
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content) # Use pre-compiled pattern
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections # Return empty list if no TOC found

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # Using re.escape for literal brackets and compiling patterns once.
    item_patterns = [
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL), # Standard table format
        re.compile(r'(?i)PART\\s+([IVX]+)\\s*\\|\\s*([^|]+)', re.DOTALL), # Part in table
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s+([^|\\d]+)', re.M), # Standalone Item line
        re.compile(r'(?i)(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL), # Number-dot item in table
        re.compile(r'(?i)PART\\s+([IVX]+)', re.M) # Simple PART line
    ]

    found_items = []
    for pattern in item_patterns:
        for match in pattern.finditer(toc_content): # Use pre-compiled pattern and toc_content
            groups = match.groups()
            if len(groups) >= 2: # Pattern captured both ID and Title
                item_id = groups[0].strip()
                item_title = groups[1].strip()
                item_title = re.sub(r'\\s+', ' ', item_title) # Normalize whitespace
                found_items.append((item_id, item_title))
            elif len(groups) == 1: # Pattern only captured ID (e.g., simple PART)
                item_id = groups[0].strip()
                # Attempt to get text immediately following the item_id if available, otherwise use a generic title
                # This makes the TOC parsing more robust for less structured TOCs.
                line_after_id_match = toc_content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    item_title = line_after_id_match
                else:
                    item_title = f"Section {item_id}"
                item_title = re.sub(r'\\s+', ' ', item_title)
                found_items.append((item_id, item_title))


    # Remove duplicates
    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:50]}" # Use a longer slice for better uniqueness
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = item_id

        toc_sections.append(DocumentSection(
            title=title,
            content="", # Content is intentionally empty here; will be filled by main sectioning if this strategy is chosen.
            section_type=section_type,
            item_number=item_number,
            part=part_num
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3: # A reasonable number of sections to consider it successful
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents. If successful and provides many sections,
    # it indicates a structured document. We'll use these titles and re-scan the document
    # for their content.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3: # If TOC parsing yielded a good number of entries
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")
        
        combined_sections = []
        
        # This part of the logic needs to be robust for mapping TOC titles back to content.
        # A simple approach would be to find the starting position of each TOC title in the main content.
        # This can be error-prone if titles are not unique or are slightly different in the body.
        
        # For a more reliable "TOC-based content extraction," you'd typically:
        # 1. Take the sorted TOC titles.
        # 2. Iterate through the main `content` to find each title's occurrence.
        # 3. The content of a section is from its title's start to the next title's start.
        
        # Let's implement a basic version of that here to utilize the TOC entries.
        # This is an enhancement to strategy 2, making it more effective than just
        # returning the TOC entries without content.
        
        current_content_pos = 0
        for i, toc_entry in enumerate(toc_entries):
            # Create a flexible regex for the title to find it in the main content
            # Escape special regex characters in the title and allow for variable whitespace
            search_title = re.escape(toc_entry.title).replace('\\ ', '\\s*')
            
            # Prioritize matching by Item/Part numbers if available, as they are more unique
            if toc_entry.item_number:
                search_pattern = re.compile(r'(?i)^\s*(?:Item\s*' + re.escape(toc_entry.item_number) + r'|' + search_title + r')', re.M)
            elif toc_entry.part:
                search_pattern = re.compile(r'(?i)^\s*(?:PART\s*' + re.escape(toc_entry.part) + r'|' + search_title + r')', re.M)
            else:
                search_pattern = re.compile(r'(?i)^\s*' + search_title, re.M)

            match = search_pattern.search(content, pos=current_content_pos)
            
            if match:
                start_pos = match.start()
                
                # The content for this section goes until the start of the next TOC entry, or end of document
                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry_title = re.escape(toc_entries[i+1].title).replace('\\ ', '\\s*')
                    if toc_entries[i+1].item_number:
                        next_pattern = re.compile(r'(?i)^\s*(?:Item\s*' + re.escape(toc_entries[i+1].item_number) + r'|' + next_toc_entry_title + r')', re.M)
                    elif toc_entries[i+1].part:
                        next_pattern = re.compile(r'(?i)^\s*(?:PART\s*' + re.escape(toc_entries[i+1].part) + r'|' + next_toc_entry_title + r')', re.M)
                    else:
                        next_pattern = re.compile(r'(?i)^\s*' + next_toc_entry_title, re.M)
                    
                    next_match = next_pattern.search(content, pos=start_pos + len(match.group(0))) # Search after current match
                    if next_match:
                        next_start_pos = next_match.start()
                
                section_content = content[start_pos:next_start_pos].strip()
                
                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: {toc_entry.title}. Skipping or appending as part of previous.")
                # If a TOC entry is not found, its content might be part of the previous section,
                # or it's a false positive in the TOC. For simplicity, we just move on.
                # A more advanced approach would merge its expected content with the previous section.

        if len(combined_sections) >= 3: # If TOC-based content extraction yields good results
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    # Adjust fiscal year for 10-K filings if the filing date is early in the calendar year
    # and typically refers to the previous fiscal year end.
    if form_type == '10K' and filing_date.month <= 3: # Assuming fiscal year ends typically in Dec or Jan-Mar for previous year
        fiscal_year -= 1 # Often a 10K filed in Jan-Mar of current year is for previous fiscal year

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0 # Renamed variable to avoid conflict with function parameter 'overlap_tokens'

            # Add sentences from the end until we reach overlap target
            # Ensure we don't go past the start of the chunk
            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            # If after trying to create overlap, we still don't have enough tokens for overlap
            # (e.g., first few sentences are very long), just take some minimal content.
            if not overlap_sentences and current_chunk_sentences:
                # Fallback to last sentence if no other overlap possible and current chunk exists
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            # Start new chunk with overlap + current sentence
            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            # Add sentence to current chunk
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    # Add final chunk if it has content
    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        # Clean table markers
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:  # Only add non-empty tables
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!")
print("="*60)
print("Key improvements over original approach:")
print("✅ Multi-strategy section detection with fallbacks")
print("✅ Sentence-aware chunking with overlap")
print("✅ Robust error handling and logging")
print("✅ Structured data classes for better organization")
print("✅ Quality validation and statistics")
print("✅ Separate table and narrative processing")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}")
        print("="*50)

        # Changed to universal processing function
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:")
        for key, value in stats.items():
            print(f"  {key}: {value}")

        print("\n📝 Sample Chunks:")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):")
            print(f"  Section: {chunk.section_info}")
            print(f"  Tokens: {chunk.token_count}")
            print(f"  Text preview: {chunk.text[:200]}...")

        return chunks
    else:
        print(f"❌ File not found: {test_file}")
        print("Please update the file path to match your data structure")
        return []

# Run the test
chunks = test_single_file()

def compare_section_strategies(content: str): # Changed content_sample to content to use full content
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies")
    print("="*50)

    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content) # Changed content_sample to content
    print(f"Strategy 1 (Regex): {len(sections_1)} sections")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")

    print()

    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content) # Changed content_sample to content
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...")

    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        # Load full content for comparison, not just a sample
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison) # Clean it for consistent comparison

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze")
        return

    print("📊 Chunking Quality Analysis")
    print("="*50)

    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}")
    print(f"  Min: {min(token_counts)}")
    print(f"  Max: {max(token_counts)}")

    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1

    print(f"\nChunk Types:")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}")

    # Section distribution
    sections_dist = {} # Renamed to avoid conflict with `sections` list
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1

    print(f"\nSection Distribution:")
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks")

    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters")
    print("="*50)

    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}")
        # Changed to universal processing function
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}")

    return results

# Test different parameters
param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling")
    print("="*50)

    # Test 1: Non-existent file
    print("Test 1: Non-existent file")
    # Changed to universal processing function
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)")

    # Test 2: Empty file
    print("\nTest 2: Empty content")
    empty_sections = detect_sections_robust_universal("") # Changed to universal detection
    print(f"  Result: {len(empty_sections)} sections")

    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    # Changed to universal processing function
    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)")

    # Clean up
    os.unlink(temp_file)

    # Test 4: Very short text
    print("\nTest 4: Very short text")
    # This call is correct, as create_overlapping_chunks is a helper
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}")
        return []

    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}")

        # Changed to universal processing function
        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    # Summary statistics
    print(f"\n📊 Batch Processing Summary:")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}")

    print(f"\n📋 Per-file results:")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables")

    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary")
    print("="*60)

    # Assumes 'chunks' variable from test_single_file() is available
    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first")
        return

    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})")
    print(f"  • Total chunks: {len(df)}")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%")

    print(f"\n📊 Chunk Distribution by Type:")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)")

    print(f"\n📚 Section Breakdown:")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks")

    # Quality metrics
    print(f"\n✅ Quality Metrics:")

    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)")

    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)")

    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}")

    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:")

    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        # Find the actual chunk object to get its full text
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

# Create final summary
summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:")
    for improvement in improvements:
        print(f"  {improvement}")

    print(f"\n⚖️ Potential Tradeoffs:")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}")

    print(f"\n🎯 Recommended Next Steps:")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files")
    print("Then: Implement the embedding pipeline and MCP server!")
    print("="*60)

compare_with_original()

# Test functions adapted to _fixed suffix to avoid NameErrors from notebook re-runs
# Ensure these are called after all function definitions.
print("🚀 Ready to test universal SEC detection!")
print("\n1. Run test_universal_detection_fixed() to test all files")
print("2. Run compare_old_vs_universal_fixed() to see the improvement")
print("3. Run quick_pattern_test_fixed() to see what patterns match")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt",
        "processed_filings/KO/KO_10Q_2020-07-22.txt"  # If you have this one
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Test universal detection
        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):  # Show first 10
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        # Test full pipeline
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            # Show section distribution
            section_counts = {}
            for chunk in chunks[:20]:  # Sample first 20
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    # Summary comparison
    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Old detection
    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content) # Use detect_sections_robust_old

    # New universal detection
    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    # Test key patterns
    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content) # Use compiled pattern
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            # Clean up match for display
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
ERROR:__main__:Error processing processed_filings/AAPL/AAPL_10K_2020-10-30.txt: unbalanced parenthesis at position 73
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_universal_sec. Returning empty sections.
INFO:__main__:Empty content provided to detect_sections_from_toc_universal. Returning empty sections.
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmpc3gm1vf0_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
ERROR:__main__:Error processing processed_filings/AMZN/AMZN_10Q_2022-04-29.txt: un

🚀 SEC Filing Preprocessing Strategy - Ready for Testing!
Key improvements over original approach:
✅ Multi-strategy section detection with fallbacks
✅ Sentence-aware chunking with overlap
✅ Robust error handling and logging
✅ Structured data classes for better organization
✅ Quality validation and statistics
✅ Separate table and narrative processing
🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt
📊 Processing Results:
  error: No chunks created

📝 Sample Chunks:
No test file processed yet
🛡️ Testing Error Handling
Test 1: Non-existent file
  Result: 0 chunks (expected 0)

Test 2: Empty content
  Result: 1 sections

Test 3: Malformed filename
  Result: 0 chunks (expected 0)

Test 4: Very short text
  Result: 0 chunks
🔄 Testing Batch Processing (max 3 files)
Processing 3 files...
  1/3: AMZN_10Q_2022-04-29.txt
  2/3: AMZN_10Q_2020-05-01.txt
  3/3: AMZN_10Q_2020-10-30.txt

📊 Batch Processing Summary:
  Total files processed: 3

  Total chunks created: 0
  Average chunks per 

INFO:__main__:🔍 Universal SEC detection found 2 unique sections:
INFO:__main__:  1: Item/Part I - . FINANCIAL INFORMATION...
INFO:__main__:  2: Item/Part II - . OTHER INFORMATION...


error: unbalanced parenthesis at position 73

In [32]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start, # Changed from match.start() for consistency with line-based detection
                'end_pos': line_end,     # Changed from match.end()
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # A reasonable number of sections to consider it successful
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information
    """
    if section.section_type == 'item' and section.item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(section.item_number, "Unknown Section")
            return f"Item {section.item_number} - {item_name}"
        elif form_type == '10Q':
            # Determine which part this item belongs to
            if section.item_number in ITEM_NAME_MAP_10Q_PART_I:
                item_name = ITEM_NAME_MAP_10Q_PART_I[section.item_number]
                return f"Part I, Item {section.item_number} - {item_name}"
            else:
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(section.item_number, "Unknown Section")
                return f"Part II, Item {section.item_number} - {item_name}"

    elif section.section_type == 'part' and section.part:
        return section.part

    else:
        return section.title or "Document Content"

def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting
    """
    sections = []

    if not content: # Added check for empty content
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using re.escape for literal brackets, and compiling patterns once.
    # Replaced '\[\[...\]\]' with re.escape('[...]') as it's more standard and robust.
    patterns = [
        re.compile(re.escape('[TABLE_START]') + r'.*?Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^\[]+?)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s+([IVX]+)\s*\|\s*([^\[]+?)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s+([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s+([IVX]+)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),

        re.compile(r'^\s*Item\s+(\d{1,2}[A-C]?)\.\s*([^\n]+)', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        re.compile(r'^\s*PART\s+([IVX]+)\s*([^\n]*)', re.I | re.M),
        re.compile(r'PART\s+([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        re.compile(re.escape('[TABLE_START]') + r'.*?(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        re.compile(r'^(Item\s+\d{1,2}[A-C]?\.\s+[^|]+?)$', re.I | re.M),
        re.compile(r'^(PART\s+[IVX]+)(?:\s*[-–—]\s*(.+))?$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get context around the match (full line)
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section information
            groups = match.groups()

            section_id = groups[0].strip() if groups else 'unknown'
            section_title = ""

            if len(groups) >= 2 and groups[1]:
                section_title = groups[1].strip()
                # Clean up section title from table markers - use actual markers here
                section_title = re.sub(re.escape('[TABLE_END]') + r'.*', '', section_title, flags=re.I).strip()
                section_title = section_title.replace('|', '').strip()
            elif len(groups) == 1:
                line_after_id_match = content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    section_title = line_after_id_match
                else:
                    section_title = f"Section {section_id}"
            else:
                section_title = full_line

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id,
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches by their start position
    all_matches.sort(key=lambda x: x['start_pos'])

    # Remove duplicates and prioritize 'better' matches
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0])
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            if current_match['start_pos'] - last_added_match['start_pos'] > 150:
                final_matches.append(current_match)
            elif current_match['start_pos'] - last_added_match['start_pos'] < 50:
                if last_added_match['section_id'] == 'unknown' and current_match['section_id'] != 'unknown':
                    final_matches[-1] = current_match
                elif last_added_match['section_id'] == current_match['section_id'] and last_added_match['pattern_idx'] > current_match['pattern_idx']:
                    final_matches[-1] = current_match

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content'
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id:
                title = part
            elif not title:
                 title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id:
                title = f"Item {item_number}"
            elif not title:
                 title = f"Item {item_number}"

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns
    # Using re.escape for literal brackets and compiling patterns once.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL), # Simpler page break match
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL), # Simpler page break match
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL), # Simpler page break match
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL), # Simpler page break match
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # Using re.escape for literal brackets and compiling patterns once.
    item_patterns = [
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL),
        re.compile(r'(?i)PART\\s+([IVX]+)\\s*\\|\\s*([^|]+)', re.DOTALL),
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s+([^|\\d]+)', re.M),
        re.compile(r'(?i)(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL),
        re.compile(r'(?i)PART\\s+([IVX]+)', re.M)
    ]

    found_items = []
    for pattern in item_patterns:
        for match in pattern.finditer(toc_content):
            groups = match.groups()
            if len(groups) >= 2:
                item_id = groups[0].strip()
                item_title = groups[1].strip()
                item_title = re.sub(r'\\s+', ' ', item_title)
                found_items.append((item_id, item_title))
            elif len(groups) == 1:
                item_id = groups[0].strip()
                line_after_id_match = toc_content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    item_title = line_after_id_match
                else:
                    item_title = f"Section {item_id}"
                item_title = re.sub(r'\\s+', ' ', item_title)
                found_items.append((item_id, item_title))

    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:50]}"
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = item_id

        toc_sections.append(DocumentSection(
            title=title,
            content="",
            section_type=section_type,
            item_number=item_number,
            part=part_num
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            search_title = re.escape(toc_entry.title).replace('\\ ', '\\s*')

            if toc_entry.item_number:
                search_pattern = re.compile(r'(?i)^\s*(?:Item\s*' + re.escape(toc_entry.item_number) + r'|' + search_title + r')', re.M)
            elif toc_entry.part:
                search_pattern = re.compile(r'(?i)^\s*(?:PART\s*' + re.escape(toc_entry.part) + r'|' + search_title + r')', re.M)
            else:
                search_pattern = re.compile(r'(?i)^\s*' + search_title, re.M)

            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()

                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry_title = re.escape(toc_entries[i+1].title).replace('\\ ', '\\s*')
                    if toc_entries[i+1].item_number:
                        next_pattern = re.compile(r'(?i)^\s*(?:Item\s*' + re.escape(toc_entries[i+1].item_number) + r'|' + next_toc_entry_title + r')', re.M)
                    elif toc_entries[i+1].part:
                        next_pattern = re.compile(r'(?i)^\s*(?:PART\s*' + re.escape(toc_entries[i+1].part) + r'|' + next_toc_entry_title + r')', re.M)
                    else:
                        next_pattern = re.compile(r'(?i)^\s*' + next_toc_entry_title, re.M)

                    # Search for the next section from the end of the current section's match
                    next_match = next_pattern.search(content, pos=match.end()) # Search from end of current match
                    if next_match:
                        next_start_pos = next_match.start()

                section_content = content[start_pos:next_start_pos].strip()

                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: {toc_entry.title}. This section might be merged with previous or skipped.")
                # This could also be a sub-section of a larger item not directly represented in primary TOC.
                # For now, it will effectively be skipped if its start isn't found.

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    # Adjust fiscal year for 10-K filings if the filing date is early in the calendar year
    # and typically refers to the previous fiscal year end.
    if form_type == '10K' and filing_date.month <= 3: # Assuming fiscal year ends typically in Dec or Jan-Mar for previous year
        fiscal_year -= 1 # Often a 10K filed in Jan-Mar of current year is for previous fiscal year

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0 # Renamed variable to avoid conflict with function parameter 'overlap_tokens'

            # Add sentences from the end until we reach overlap target
            # Ensure we don't go past the start of the chunk
            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            # If after trying to create overlap, we still don't have enough tokens for overlap
            # (e.g., first few sentences are very long), just take some minimal content.
            if not overlap_sentences and current_chunk_sentences:
                # Fallback to last sentence if no other overlap possible and current chunk exists
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            # Start new chunk with overlap + current sentence
            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            # Add sentence to current chunk
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    # Add final chunk if it has content
    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        # Clean table markers
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:  # Only add non-empty tables
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        # Changed to universal processing function
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

# Run the test
chunks = test_single_file()

def compare_section_strategies(content: str): # Changed content_sample to content to use full content
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content) # Changed content_sample to content
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content) # Changed content_sample to content
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        # Load full content for comparison, not just a sample
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison) # Clean it for consistent comparison

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1

    print(f"\nChunk Types:\n")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    # Section distribution
    sections_dist = {} # Renamed to avoid conflict with `sections` list
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1

    print(f"\nSection Distribution:\n")
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        # Changed to universal processing function
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

# Test different parameters
param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    # Test 1: Non-existent file
    print("Test 1: Non-existent file\n")
    # Changed to universal processing function
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    # Test 2: Empty file
    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("") # Changed to universal detection
    print(f"  Result: {len(empty_sections)} sections\n")

    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename\n")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    # Changed to universal processing function
    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    # Clean up
    os.unlink(temp_file)

    # Test 4: Very short text
    print("\nTest 4: Very short text\n")
    # This call is correct, as create_overlapping_chunks is a helper
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        # Changed to universal processing function
        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    # Summary statistics
    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    # Assumes 'chunks' variable from test_single_file() is available
    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks\n")

    # Quality metrics
    print(f"\n✅ Quality Metrics:\n")

    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:\n")

    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        # Find the actual chunk object to get its full text
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

# Create final summary
summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

# Test functions adapted to _fixed suffix to avoid NameErrors from notebook re-runs
# Ensure these are called after all function definitions.
print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt", # This file name is in the future based on current date
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Test universal detection
        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        # Test full pipeline
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt


🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 127 chunks for AAPL_10K_2020-10-30.txt
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_universal_sec. Returning empty sections.
INFO:__main__:Empty content provided to detect_sections_from_toc_universal. Returning empty sections.
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmpum01nfi7_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting univers

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt

  2/3: AMZN_10Q_2020-05-01.txt

  3/3: AMZN_10Q_2020-10-30.txt



INFO:__main__:Created 120 chunks for AMZN_10Q_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 17 unique sections:
INFO:__main__:  1: Item/Part I - Item 1.    Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 6 - Selected Financial Data...
INFO:__main__:  7: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  8: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  9: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  10: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  11: Item/Part 9B - Other Information...
INFO:__main__:  12: Item/Part 11 - Executive C


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 172

  • Average chunk size: 380 tokens

  • Size range: 38 - 1692 tokens

  • Overlap rate: 61.0%


📊 Chunk Distribution by Type:

  • narrative: 106 chunks (61.6%)

  • table: 66 chunks (38.4%)


📚 Section Breakdown:

  • Full Document: 172 chunks


✅ Quality Metrics:

  • Very small chunks (<50 tokens): 2 (1.2%)

  • Large chunks (>800 tokens): 3 (1.7%)

  • Unique sections identified: 1


🔍 Sample Chunks for Review:


  TABLE example (58 tokens):

    Section: Full Document

    Preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or or

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 4 unique sections:
INFO:__main__:  1: Item/Part I - [TABLE_START]...
INFO:__main__:  2: Item/Part II - [TABLE_START]...
INFO:__main__:  3: Item/Part III - [TABLE_START]...
INFO:__main__:  4: Item/Part IV - [TABLE_START]...
INFO:__main__:Universal detection successful (Strategy 1): Found 4 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1441 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10K_2023-02-03.txt


✅ Found 4 sections:

  1. [TABLE_START]

     Type: part, Length: 71,104 chars

  2. [TABLE_START]

     Type: part, Length: 189,316 chars

  3. [TABLE_START]

     Type: part, Length: 2,224 chars

  4. [TABLE_START]

     Type: part, Length: 10,492 chars



INFO:__main__:Created 210 chunks for AMZN_10K_2023-02-03.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 2 unique sections:
INFO:__main__:  1: Item/Part I - . FINANCIAL INFORMATION...
INFO:__main__:  2: Item/Part II - . OTHER INFORMATION...



📊 Processing Results:

  total_chunks: 210

  avg_tokens: 332.1666666666667

  min_tokens: 6

  max_tokens: 1157

  chunks_with_overlap: 119

  table_chunks: 90

  narrative_chunks: 120

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10Q_2024-11-01.txt



INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (903 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AMZN_10Q_2024-11-01.txt
INFO:__main__:Created 132 chunks for AMZN_10Q_2024-11-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8 unique sections:



✅ Found 1 sections:

  1. Full Document

     Type: document, Length: 187,951 chars


📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  max_tokens: 1548

  chunks_with_overlap: 81

  table_chunks: 50

  narrative_chunks: 82

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt



INFO:__main__:  1: Item/Part I - . Financial Information...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part II - . Other Information...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (5004 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in KO_10Q_2020-07-22.txt
INFO:__main__:Created 161 chunks for KO_10Q_2020-07-22.txt
INFO:__main__:At


✅ Found 8 sections:

  1. . Financial Information

     Type: part, Length: 115,924 chars

  2. Management's Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 87,923 chars

  3. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 207 chars

  4. Controls and Procedures

     Type: item, Length: 1,004 chars

  5. . Other Information

     Type: part, Length: 248 chars

  6. Risk Factors

     Type: item, Length: 11,661 chars

  7. Unregistered Sales of Equity Securities and Use of Proceeds

     Type: item, Length: 2,127 chars

  8. Exhibits

     Type: item, Length: 13,918 chars


📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97

  table_chunks: 63

  narrative_chunks: 98

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


📊 UNIVERSAL DETECTION SUMMARY

AAPL_10K_2020-10-30.txt 

INFO:__main__:🔍 Universal SEC detection found 17 unique sections:
INFO:__main__:  1: Item/Part I - Item 1.    Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 6 - Selected Financial Data...
INFO:__main__:  7: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  8: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  9: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  10: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  11: Item/Part 9B - Other Information...
INFO:__main__:  12: Item/Part 11 - Executive Compensation...
INFO:__main__:  13: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...
INFO


📊 Comparison Results:

  Old detection: 19 sections

  Universal detection: 17 sections

  Improvement: +-2 sections


📋 Old Sections:

  1. Part I

  2. Item 1A

  3. Item 1B

  4. Item 3

  5. Item 4

  6. Item 6

  7. Item 7

  8. Item 7A

  9. Item 8

  10. Notes to Consolidated Financial Statements

  11. Opinion on the Financial Statements

  12. Item 9

  13. Item 9B

  14. Item 11

  15. Item 12

  16. Item 13

  17. Item 14

  18. Part IV

  19. Item 16


📋 Universal Sections:

  1. Item 1.    Business

  2. Risk Factors

  3. Unresolved Staff Comments

  4. Legal Proceedings

  5. Mine Safety Disclosures

  6. Selected Financial Data

  7. Management’s Discussion and Analysis of Financial Condition and Results of Operations

  8. Quantitative and Qualitative Disclosures About Market Risk

  9. Financial Statements and Supplementary Data

  10. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure

  11. Other Information

  12. Executive Compen

In [33]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start, # Changed from match.start() for consistency with line-based detection
                'end_pos': line_end,     # Changed from match.end()
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # A reasonable number of sections to consider it successful
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information
    """
    if section.section_type == 'item' and section.item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(section.item_number, "Unknown Section")
            return f"Item {section.item_number} - {item_name}"
        elif form_type == '10Q':
            # Determine which part this item belongs to
            if section.item_number in ITEM_NAME_MAP_10Q_PART_I:
                item_name = ITEM_NAME_MAP_10Q_PART_I[section.item_number]
                return f"Part I, Item {section.item_number} - {item_name}"
            else:
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(section.item_number, "Unknown Section")
                return f"Part II, Item {section.item_number} - {item_name}"

    elif section.section_type == 'part' and section.part:
        return section.part

    else:
        return section.title or "Document Content"

def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting
    """
    sections = []

    if not content: # Added check for empty content
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using re.escape for literal brackets, and compiling patterns once.
    # Changed to match the exact string '[TABLE_START]' and '[TABLE_END]'
    patterns = [
        re.compile(re.escape('[TABLE_START]') + r'.*?Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^\[]+?)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s+([IVX]+)\s*\|\s*([^\[]+?)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s+([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s+([IVX]+)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),

        # Standalone ITEM patterns (fallback) - using non-escaped word parts
        re.compile(r'^\s*Item\s+(\d{1,2}[A-C]?)\.\s*([^\n]+)', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (fallback) - using non-escaped word parts
        re.compile(r'^\s*PART\s+([IVX]+)\s*([^\n]*)', re.I | re.M),
        re.compile(r'PART\s+([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-only patterns in tables - using re.escape for markers
        re.compile(re.escape('[TABLE_START]') + r'.*?(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Headers that might appear standalone - using non-escaped word parts
        re.compile(r'^(Item\s+\d{1,2}[A-C]?\.\s+[^|]+?)$', re.I | re.M),
        re.compile(r'^(PART\s+[IVX]+)(?:\s*[-–—]\s*(.+))?$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get context around the match (full line)
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section information
            groups = match.groups()

            section_id = groups[0].strip() if groups else 'unknown'
            section_title = ""

            if len(groups) >= 2 and groups[1]:
                section_title = groups[1].strip()
                # Clean up section title from table markers - use actual markers here
                section_title = re.sub(re.escape('[TABLE_END]') + r'.*', '', section_title, flags=re.I).strip()
                section_title = section_title.replace('|', '').strip()
            elif len(groups) == 1:
                line_after_id_match = content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    section_title = line_after_id_match
                else:
                    section_title = f"Section {section_id}"
            else:
                section_title = full_line

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id,
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches by their start position
    all_matches.sort(key=lambda x: x['start_pos'])

    # Remove duplicates and prioritize 'better' matches
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0]) # Add the first match
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # Heuristic for deciding whether to add or replace:
            # 1. If current match is significantly after the last added match, add it.
            if current_match['start_pos'] - last_added_match['start_pos'] > 150: # Increased distance for new section
                final_matches.append(current_match)
            # 2. If current match is very close, but provides a more specific 'item' or 'part' ID
            #    and the last added one was generic 'unknown' or less specific.
            elif current_match['start_pos'] - last_added_match['start_pos'] < 50: # Close proximity
                if last_added_match['section_id'] == 'unknown' and current_match['section_id'] != 'unknown':
                    final_matches[-1] = current_match # Replace with more specific match
                elif last_added_match['section_id'] == current_match['section_id'] and last_added_match['pattern_idx'] > current_match['pattern_idx']:
                    # If same ID but new pattern has higher priority (lower index means earlier in list)
                    final_matches[-1] = current_match

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # End position is the start of the next matched section, or end of content if it's the last one
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content'
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id:
                title = part
            elif not title:
                 title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id:
                title = f"Item {item_number}"
            elif not title:
                 title = f"Item {item_number}"

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns
    # Using re.escape for literal brackets and compiling patterns once.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # Removed re.escape from words like "Item" and "PART" as they are literal words not regex metacharacters here
    item_patterns = [
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL), # Standard table format
        re.compile(r'(?i)PART\\s+([IVX]+)\\s*\\|\\s*([^|]+)', re.DOTALL), # Part in table
        re.compile(r'(?i)Item\\s+(\\d{1,2}[A-C]?)\\.\\s+([^|\\d]+)', re.M), # Standalone Item line (non-table)
        re.compile(r'(?i)(\\d{1,2}[A-C]?)\\.\\s*\\|\\s*([^|]+?)\\s*\\|\\s*\\d+', re.DOTALL), # Number-dot item in table
        re.compile(r'(?i)PART\\s+([IVX]+)', re.M) # Simple PART line (non-table)
    ]

    found_items = []
    for pattern in item_patterns:
        for match in pattern.finditer(toc_content):
            groups = match.groups()
            if len(groups) >= 2:
                item_id = groups[0].strip()
                item_title = groups[1].strip()
                item_title = re.sub(r'\\s+', ' ', item_title)
                found_items.append((item_id, item_title))
            elif len(groups) == 1:
                item_id = groups[0].strip()
                line_after_id_match = toc_content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    item_title = line_after_id_match
                else:
                    item_title = f"Section {item_id}"
                item_title = re.sub(r'\\s+', ' ', item_title)
                found_items.append((item_id, item_title))

    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:50]}"
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = item_id

        toc_sections.append(DocumentSection(
            title=title,
            content="",
            section_type=section_type,
            item_number=item_number,
            part=part_num
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content)

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            search_title = re.escape(toc_entry.title).replace('\\ ', '\\s*')

            if toc_entry.item_number:
                # Prioritize searching for "Item X. Title" or "PART X Title"
                # This regex tries to match either the standardized Item/Part format or the escaped title
                pattern_str = r'(?i)^\s*(?:'
                if toc_entry.item_number:
                    pattern_str += r'Item\s*' + re.escape(toc_entry.item_number) + r'\.'
                elif toc_entry.part:
                    pattern_str += r'PART\s*' + re.escape(toc_entry.part)
                
                # Add the title as an alternative match
                if search_title:
                    pattern_str += r'|' + search_title
                pattern_str += r')'
                search_pattern = re.compile(pattern_str, re.M)
            else:
                search_pattern = re.compile(r'(?i)^\s*' + search_title, re.M)

            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()

                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_search_title = re.escape(next_toc_entry.title).replace('\\ ', '\\s*')

                    next_pattern_str = r'(?i)^\s*(?:'
                    if next_toc_entry.item_number:
                        next_pattern_str += r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.'
                    elif next_toc_entry.part:
                        next_pattern_str += r'PART\s*' + re.escape(next_toc_entry.part)
                    
                    if next_search_title:
                        next_pattern_str += r'|' + next_search_title
                    next_pattern_str += r')'
                    next_pattern = re.compile(next_pattern_str, re.M)
                    
                    # Search for the next section from the end of the current section's match
                    next_match = next_pattern.search(content, pos=match.end())
                    if next_match:
                        next_start_pos = next_match.start()

                section_content = content[start_pos:next_start_pos].strip()

                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    # Adjust fiscal year for 10-K filings if the filing date is early in the calendar year
    # and typically refers to the previous fiscal year end.
    if form_type == '10K' and filing_date.month <= 3: # Assuming fiscal year ends typically in Dec or Jan-Mar for previous year
        fiscal_year -= 1 # Often a 10K filed in Jan-Mar of current year is for previous fiscal year

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0 # Renamed variable to avoid conflict with function parameter 'overlap_tokens'

            # Add sentences from the end until we reach overlap target
            # Ensure we don't go past the start of the chunk
            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            # If after trying to create overlap, we still don't have enough tokens for overlap
            # (e.g., first few sentences are very long), just take some minimal content.
            if not overlap_sentences and current_chunk_sentences:
                # Fallback to last sentence if no other overlap possible and current chunk exists
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            # Start new chunk with overlap + current sentence
            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            # Add sentence to current chunk
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    # Add final chunk if it has content
    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        # Clean table markers
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:  # Only add non-empty tables
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        # Changed to universal processing function
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

# Run the test
chunks = test_single_file()

def compare_section_strategies(content: str): # Changed content_sample to content to use full content
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content) # Changed content_sample to content
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content) # Changed content_sample to content
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        # Load full content for comparison, not just a sample
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison) # Clean it for consistent comparison

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1

    print(f"\nChunk Types:\n")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    # Section distribution
    sections_dist = {} # Renamed to avoid conflict with `sections` list
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1

    print(f"\nSection Distribution:\n")
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        # Changed to universal processing function
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

# Test different parameters
param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    # Test 1: Non-existent file
    print("Test 1: Non-existent file\n")
    # Changed to universal processing function
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    # Test 2: Empty file
    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("") # Changed to universal detection
    print(f"  Result: {len(empty_sections)} sections\n")

    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename\n")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    # Changed to universal processing function
    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    # Clean up
    os.unlink(temp_file)

    # Test 4: Very short text
    print("\nTest 4: Very short text\n")
    # This call is correct, as create_overlapping_chunks is a helper
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        # Changed to universal processing function
        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    # Summary statistics
    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    # Assumes 'chunks' variable from test_single_file() is available
    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks\n")

    # Quality metrics
    print(f"\n✅ Quality Metrics:\n")

    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:\n")

    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        # Find the actual chunk object to get its full text
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

# Create final summary
summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

# Test functions adapted to _fixed suffix to avoid NameErrors from notebook re-runs
# Ensure these are called after all function definitions.
print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt", # This file name is in the future based on current date
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Test universal detection
        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        # Test full pipeline
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt


🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 127 chunks for AAPL_10K_2020-10-30.txt
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_universal_sec. Returning empty sections.
INFO:__main__:Empty content provided to detect_sections_from_toc_universal. Returning empty sections.
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmp5dz7b037_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting univers

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt

  2/3: AMZN_10Q_2020-05-01.txt

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 1

INFO:__main__:  1: Item/Part I - Item 1.    Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 6 - Selected Financial Data...
INFO:__main__:  7: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  8: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  9: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  10: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  11: Item/Part 9B - Other Information...
INFO:__main__:  12: Item/Part 11 - Executive Compensation...
INFO:__main__:  13: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...
INFO:__main__:  14: Item/Part 13 - Certain Relationships and Related T


✅ Found 17 sections:

  1. Item 1.    Business

     Type: part, Length: 13,274 chars

  2. Risk Factors

     Type: item, Length: 61,136 chars

  3. Unresolved Staff Comments

     Type: item, Length: 582 chars

  4. Legal Proceedings

     Type: item, Length: 898 chars

  5. Mine Safety Disclosures

     Type: item, Length: 4,292 chars

  6. Selected Financial Data

     Type: item, Length: 1,745 chars

  7. Management’s Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 33,154 chars

  8. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 6,799 chars

  9. Financial Statements and Supplementary Data

     Type: item, Length: 103,042 chars

  10. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure

     Type: item, Length: 4,635 chars


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overl

INFO:__main__:  1: Item/Part I - [TABLE_START]...
INFO:__main__:  2: Item/Part II - [TABLE_START]...
INFO:__main__:  3: Item/Part III - [TABLE_START]...
INFO:__main__:  4: Item/Part IV - [TABLE_START]...
INFO:__main__:Universal detection successful (Strategy 1): Found 4 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1441 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AMZN_10K_2023-02-03.txt
INFO:__main__:Created 210 chunks for AMZN_10K_2023-02-03.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 2 unique sections:
INFO:__main__:  1: Item/Part I - . FINANCIAL INFORMATION...
INFO:__main__:  2: Item/Part II - . OTHER INFORMATION...



✅ Found 4 sections:

  1. [TABLE_START]

     Type: part, Length: 71,104 chars

  2. [TABLE_START]

     Type: part, Length: 189,316 chars

  3. [TABLE_START]

     Type: part, Length: 2,224 chars

  4. [TABLE_START]

     Type: part, Length: 10,492 chars


📊 Processing Results:

  total_chunks: 210

  avg_tokens: 332.1666666666667

  min_tokens: 6

  max_tokens: 1157

  chunks_with_overlap: 119

  table_chunks: 90

  narrative_chunks: 120

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10Q_2024-11-01.txt



INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (903 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AMZN_10Q_2024-11-01.txt
INFO:__main__:Created 132 chunks for AMZN_10Q_2024-11-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 8 unique sections:



✅ Found 1 sections:

  1. Full Document

     Type: document, Length: 187,951 chars


📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  max_tokens: 1548

  chunks_with_overlap: 81

  table_chunks: 50

  narrative_chunks: 82

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt



INFO:__main__:  1: Item/Part I - . Financial Information...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part II - . Other Information...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (5004 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in KO_10Q_2020-07-22.txt
INFO:__main__:Created 161 chunks for KO_10Q_2020-07-22.txt
INFO:__main__:At


✅ Found 8 sections:

  1. . Financial Information

     Type: part, Length: 115,924 chars

  2. Management's Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 87,923 chars

  3. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 207 chars

  4. Controls and Procedures

     Type: item, Length: 1,004 chars

  5. . Other Information

     Type: part, Length: 248 chars

  6. Risk Factors

     Type: item, Length: 11,661 chars

  7. Unregistered Sales of Equity Securities and Use of Proceeds

     Type: item, Length: 2,127 chars

  8. Exhibits

     Type: item, Length: 13,918 chars


📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97

  table_chunks: 63

  narrative_chunks: 98

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


📊 UNIVERSAL DETECTION SUMMARY

AAPL_10K_2020-10-30.txt 

INFO:__main__:  1: Item/Part I - Item 1.    Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 6 - Selected Financial Data...
INFO:__main__:  7: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  8: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  9: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  10: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  11: Item/Part 9B - Other Information...
INFO:__main__:  12: Item/Part 11 - Executive Compensation...
INFO:__main__:  13: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...
INFO:__main__:  14: Item/Part 13 - Certain Relationships and Related T


📊 Comparison Results:

  Old detection: 19 sections

  Universal detection: 17 sections

  Improvement: +-2 sections


📋 Old Sections:

  1. Part I

  2. Item 1A

  3. Item 1B

  4. Item 3

  5. Item 4

  6. Item 6

  7. Item 7

  8. Item 7A

  9. Item 8

  10. Notes to Consolidated Financial Statements

  11. Opinion on the Financial Statements

  12. Item 9

  13. Item 9B

  14. Item 11

  15. Item 12

  16. Item 13

  17. Item 14

  18. Part IV

  19. Item 16


📋 Universal Sections:

  1. Item 1.    Business

  2. Risk Factors

  3. Unresolved Staff Comments

  4. Legal Proceedings

  5. Mine Safety Disclosures

  6. Selected Financial Data

  7. Management’s Discussion and Analysis of Financial Condition and Results of Operations

  8. Quantitative and Qualitative Disclosures About Market Risk

  9. Financial Statements and Supplementary Data

  10. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure

  11. Other Information

  12. Executive Compen

In [34]:
import os
import re
import pandas as pd
import tiktoken
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import logging
from pathlib import Path

# Set up logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize tokenizer for accurate token counting
encoding = tiktoken.encoding_for_model("text-embedding-3-small")

# =============================================================================
# 1. SEC MAPPINGS WITH FALLBACKS
# =============================================================================

ITEM_NAME_MAP_10K = {
    "1": "Business",
    "1A": "Risk Factors",
    "1B": "Unresolved Staff Comments",
    "1C": "Cybersecurity",
    "2": "Properties",
    "3": "Legal Proceedings",
    "4": "Mine Safety Disclosures",
    "5": "Market for Registrant's Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities",
    "6": "Reserved",
    "7": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "7A": "Quantitative and Qualitative Disclosures About Market Risk",
    "8": "Financial Statements and Supplementary Data",
    "9": "Changes in and Disagreements With Accountants on Accounting and Financial Disclosure",
    "9A": "Controls and Procedures",
    "9B": "Other Information",
    "9C": "Disclosure Regarding Foreign Jurisdictions that Prevent Inspections",
    "10": "Directors, Executive Officers and Corporate Governance",
    "11": "Executive Compensation",
    "12": "Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters",
    "13": "Certain Relationships and Related Transactions, and Director Independence",
    "14": "Principal Accountant Fees and Services",
    "15": "Exhibits, Financial Statement Schedules",
    "16": "Form 10-K Summary"
}

ITEM_NAME_MAP_10Q_PART_I = {
    "1": "Financial Statements",
    "2": "Management's Discussion and Analysis of Financial Condition and Results of Operations",
    "3": "Quantitative and Qualitative Disclosures About Market Risk",
    "4": "Controls and Procedures",
}

ITEM_NAME_MAP_10Q_PART_II = {
    "1": "Legal Proceedings", "1A": "Risk Factors",
    "2": "Unregistered Sales of Equity Securities and Use of Proceeds",
    "3": "Defaults Upon Senior Securities", "4": "Mine Safety Disclosures",
    "5": "Other Information", "6": "Exhibits",
}

# =============================================================================
# 2. DATA STRUCTURES FOR BETTER ORGANIZATION
# =============================================================================

@dataclass
class FilingMetadata:
    """Structured metadata for a filing"""
    ticker: str
    form_type: str
    filing_date: str
    fiscal_year: int
    fiscal_quarter: int
    file_path: str

@dataclass
class DocumentSection:
    """Represents a section of the document"""
    title: str
    content: str
    section_type: str  # 'item', 'part', 'intro', 'table'
    item_number: Optional[str] = None
    part: Optional[str] = None
    start_pos: int = 0
    end_pos: int = 0

@dataclass
class Chunk:
    """Final chunk with all metadata"""
    chunk_id: str
    text: str
    token_count: int
    chunk_type: str  # 'narrative', 'table', 'mixed'
    section_info: str
    filing_metadata: FilingMetadata
    chunk_index: int
    has_overlap: bool = False

# =============================================================================
# 3. ROBUST TEXT CLEANING
# =============================================================================

def clean_sec_text(text: str) -> str:
    """
    Clean SEC filing text more robustly
    """
    # Remove common SEC artifacts
    text = re.sub(r'UNITED STATES\s+SECURITIES AND EXCHANGE COMMISSION.*?FORM \d+[A-Z]*', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Handle page breaks more intelligently
    text = text.replace('[PAGE BREAK]', '\n\n--- PAGE BREAK ---\n\n')

    # Preserve table boundaries but clean them up
    text = re.sub(r'\[TABLE_START\]', '\n\n=== TABLE START ===\n', text)
    text = re.sub(r'\[TABLE_END\]', '\n=== TABLE END ===\n\n', text)

    # Clean up excessive whitespace but preserve paragraph structure
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple newlines -> double newline
    text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces/tabs -> single space
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)  # Trim lines

    return text.strip()

# =============================================================================
# 4. MULTI-STRATEGY SECTION DETECTION
# =============================================================================

def detect_sections_strategy_1_improved(content: str) -> List[DocumentSection]:
    """
    Improved Strategy 1: Patterns based on real SEC filing structure
    """
    sections = []

    # Much more comprehensive patterns based on your actual files
    patterns = [
        # PART patterns - handle various formats
        re.compile(r'^\s*PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),
        re.compile(r'^PART\s+([IVX]+)(?:\s*[-–—].*?)?$', re.I | re.M),

        # ITEM patterns - much more flexible
        re.compile(r'^\s*ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'^ITEM\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),
        re.compile(r'Item\s+(\d{1,2}[A-C]?)(?:[.\s–—])', re.I | re.M),

        # Number-dot format common in SEC filings
        re.compile(r'^(\d{1,2}[A-C]?)\.\s+[A-Z][A-Za-z\s]{10,}', re.I | re.M),

        # Content-based patterns for known sections
        re.compile(r'^.{0,50}(BUSINESS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(RISK FACTORS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(LEGAL PROCEEDINGS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(FINANCIAL STATEMENTS)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(MANAGEMENT.S DISCUSSION)\s*', re.I | re.M),
        re.compile(r'^.{0,50}(PROPERTIES)\s*$', re.I | re.M),
        re.compile(r'^.{0,50}(CONTROLS AND PROCEDURES)\s*$', re.I | re.M),
    ]

    all_matches = []

    # Process each pattern
    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get the full line containing this match
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section identifier
            section_id = match.group(1) if match.groups() else 'unknown'

            all_matches.append({
                'start_pos': line_start, # Changed from match.start() for consistency with line-based detection
                'end_pos': line_end,     # Changed from match.end()
                'full_line': full_line,
                'section_id': section_id,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Remove duplicates - matches within 200 characters of each other
    unique_matches = []
    for match in sorted(all_matches, key=lambda x: x['start_pos']):
        is_duplicate = any(
            abs(match['start_pos'] - existing['start_pos']) < 200
            for existing in unique_matches
        )
        if not is_duplicate:
            unique_matches.append(match)

    # Debug output
    print(f"🔍 Improved detection found {len(unique_matches)} potential sections:")
    for i, match in enumerate(unique_matches[:15]):  # Show more for debugging
        print(f"  {i+1}: {match['full_line'][:80]}...")

    # Convert to DocumentSection objects
    for i, match in enumerate(unique_matches):
        start_pos = match['start_pos']
        end_pos = unique_matches[i + 1]['start_pos'] if i + 1 < len(unique_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        # Determine section type and metadata
        full_line_upper = match['full_line'].upper()
        section_id = match['section_id'].upper() if match['section_id'] != 'unknown' else None

        if 'PART' in full_line_upper and section_id:
            section_type = 'part'
            part = f"PART {section_id}"
            item_number = None
            title = f"Part {section_id}"
        elif ('ITEM' in full_line_upper or re.match(r'^\d+[A-C]?$', str(section_id))) and section_id:
            section_type = 'item'
            part = None
            item_number = section_id
            title = f"Item {section_id}"
        elif any(keyword in full_line_upper for keyword in
                ['BUSINESS', 'RISK', 'LEGAL', 'FINANCIAL', 'MANAGEMENT', 'PROPERTIES', 'CONTROLS']):
            section_type = 'named_section'
            part = None
            item_number = None
            title = match['full_line']
        else:
            section_type = 'content'
            part = None
            item_number = None
            title = match['full_line']

        sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part,
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return sections

def detect_sections_strategy_2(content: str) -> List[DocumentSection]:
    """
    Strategy 2: Fallback using page breaks and heuristics
    """
    sections = []

    # Split by page breaks first
    pages = content.split('--- PAGE BREAK ---')

    current_section = ""
    current_title = "Document Content"

    for i, page in enumerate(pages):
        page = page.strip()
        if not page:
            continue

        # Look for section headers in the page
        lines = page.split('\n')
        potential_headers = []

        for j, line in enumerate(lines[:10]):  # Check first 10 lines of each page
            line = line.strip()
            if (len(line) < 100 and  # Headers are usually short
                (re.search(r'\b(ITEM|PART)\b', line, re.IGNORECASE) or
                 re.search(r'\b(BUSINESS|RISK FACTORS|FINANCIAL STATEMENTS)\b', line, re.IGNORECASE))):
                potential_headers.append((j, line))

        if potential_headers:
            # Found a header, start new section
            if current_section:
                sections.append(DocumentSection(
                    title=current_title,
                    content=current_section.strip(),
                    section_type='content',
                    start_pos=0,
                    end_pos=len(current_section)
                ))

            current_title = potential_headers[0][1]
            current_section = page
        else:
            # Continue current section
            current_section += "\n\n" + page

    # Add the last section
    if current_section:
        sections.append(DocumentSection(
            title=current_title,
            content=current_section.strip(),
            section_type='content',
            start_pos=0,
            end_pos=len(current_section)
        ))

    return sections

# The `detect_sections_robust` function from your original code (renamed detect_sections_robust_old to avoid conflict)
def detect_sections_robust_old(content: str) -> List[DocumentSection]:
    """
    Multi-strategy section detection with fallbacks (original version)
    """
    logger.info("Attempting Strategy 1: Regex-based section detection")
    sections = detect_sections_strategy_1_improved(content) # Original called detect_sections_strategy_1, updated to _improved

    if len(sections) >= 3:  # A reasonable number of sections to consider it successful
        logger.info(f"Strategy 1 successful: Found {len(sections)} sections")
        return sections

    logger.warning("Strategy 1 failed, trying Strategy 2: Page-based detection")
    sections = detect_sections_strategy_2(content)

    if len(sections) >= 2:
        logger.info(f"Strategy 2 successful: Found {len(sections)} sections")
        return sections

    logger.warning("All strategies failed, creating single section")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]

def create_section_info(section: DocumentSection, form_type: str) -> str:
    """
    Create human-readable section information for DocumentSection objects,
    using form_type to select the correct item name map.
    """
    item_number = section.item_number
    section_type = section.section_type
    part_number = section.part # Get part from DocumentSection

    if section_type == 'item' and item_number:
        if form_type == '10K':
            item_name = ITEM_NAME_MAP_10K.get(item_number, "Unknown Section")
            return f"Item {item_number} - {item_name}"
        elif form_type == '10Q':
            # Use part_number from DocumentSection if available, or try to infer from item_number
            if part_number == 'PART I' or item_number in ITEM_NAME_MAP_10Q_PART_I:
                item_name = ITEM_NAME_MAP_10Q_PART_I.get(item_number, "Unknown Section")
                return f"Part I, Item {item_number} - {item_name}"
            elif part_number == 'PART II' or item_number in ITEM_NAME_MAP_10Q_PART_II:
                item_name = ITEM_NAME_MAP_10Q_PART_II.get(item_number, "Unknown Section")
                return f"Part II, Item {item_number} - {item_name}"
            else:
                return f"Item {item_number} - Unknown 10Q Section"
    
    elif section_type == 'part' and part_number: # Use part_number from DocumentSection
        # Check if the part title also contains an item number and include it
        if "Item" in section.title and section.item_number:
            return f"{part_number} - Item {section.item_number}"
        return part_number

    # Fallback for named_section, content, or document type sections
    return section.title or "Document Content"


def detect_sections_universal_sec(content: str) -> List[DocumentSection]:
    """
    Universal section detection for SEC filings with table-based formatting
    """
    sections = []

    if not content: # Added check for empty content
        logger.info("Empty content provided to detect_sections_universal_sec. Returning empty sections.")
        return sections

    # Universal patterns for table-formatted SEC filings
    # Using re.escape for literal brackets, and compiling patterns once.
    # Corrected regex for pattern components
    patterns = [
        re.compile(re.escape('[TABLE_START]') + r'.*?Item\s*(\d{1,2}[A-C]?)\.\s*\|\s*([^\[]+?)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Item\s*(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s*([IVX]+)\s*\|\s*([^\[]+?)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?PART\s*([IVX]+)\s*' + re.escape('[TABLE_END]'), re.I | re.DOTALL),

        # Standalone ITEM patterns (fallback) - no re.escape for Item/Part
        re.compile(r'^\s*Item\s*(\d{1,2}[A-C]?)\.\s*([^\n]+)', re.I | re.M),
        re.compile(r'Item\s*(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Standalone PART patterns (fallback) - no re.escape for Item/Part
        re.compile(r'^\s*PART\s*([IVX]+)\s*([^\n]*)', re.I | re.M),
        re.compile(r'PART\s*([IVX]+)\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Number-only patterns in tables - using re.escape for markers
        re.compile(re.escape('[TABLE_START]') + r'.*?(\d{1,2}[A-C]?)\.\s*\|\s*([^|]+)', re.I | re.DOTALL),

        # Headers that might appear standalone - no re.escape for Item/Part
        re.compile(r'^(Item\s*\d{1,2}[A-C]?\.\s+[^|]+?)$', re.I | re.M),
        re.compile(r'^(PART\s*[IVX]+)(?:\s*[-–—]\s*(.+))?$', re.I | re.M),
    ]

    all_matches = []

    for pattern_idx, pattern in enumerate(patterns):
        for match in pattern.finditer(content): # Use pre-compiled pattern
            # Get context around the match (full line)
            line_start = content.rfind('\n', 0, match.start()) + 1
            line_end = content.find('\n', match.end())
            if line_end == -1:
                line_end = len(content)

            full_line = content[line_start:line_end].strip()

            # Filter out obvious false positives
            if (len(full_line) > 400 or  # Too long to be a header
                len(full_line) < 3 or    # Too short
                '|' in full_line or      # Likely table content
                full_line.count(' ') > 20):  # Too many words
                continue

            # Extract section information
            groups = match.groups()

            section_id = groups[0].strip() if groups else 'unknown'
            section_title = ""

            if len(groups) >= 2 and groups[1]:
                section_title = groups[1].strip()
                # Clean up section title from table markers - use actual markers here
                section_title = re.sub(re.escape('[TABLE_END]') + r'.*', '', section_title, flags=re.I).strip()
                section_title = section_title.replace('|', '').strip()
            elif len(groups) == 1:
                line_after_id_match = content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    section_title = line_after_id_match
                else:
                    section_title = f"Section {section_id}"
            else:
                section_title = full_line

            all_matches.append({
                'start_pos': match.start(),
                'end_pos': match.end(),
                'full_line': full_line,
                'section_id': section_id,
                'section_title': section_title,
                'pattern_idx': pattern_idx,
                'match_start': match.start()
            })

    # Sort matches by their start position
    all_matches.sort(key=lambda x: x['start_pos'])

    # Remove duplicates and prioritize 'better' matches
    final_matches = []
    if all_matches:
        final_matches.append(all_matches[0]) # Add the first match
        for i in range(1, len(all_matches)):
            current_match = all_matches[i]
            last_added_match = final_matches[-1]

            # Heuristic for deciding whether to add or replace:
            # 1. If current match is significantly after the last added match, add it.
            if current_match['start_pos'] - last_added_match['start_pos'] > 150: # Increased distance for new section
                final_matches.append(current_match)
            # 2. If current match is very close, but provides a more specific 'item' or 'part' ID
            #    and the last added one was generic 'unknown' or less specific.
            elif current_match['start_pos'] - last_added_match['start_pos'] < 50: # Close proximity
                if last_added_match['section_id'] == 'unknown' and current_match['section_id'] != 'unknown':
                    final_matches[-1] = current_match # Replace with more specific match
                elif last_added_match['section_id'] == current_match['section_id'] and last_added_match['pattern_idx'] > current_match['pattern_idx']:
                    # If same ID but new pattern has higher priority (lower index means earlier in list)
                    final_matches[-1] = current_match

    logger.info(f"🔍 Universal SEC detection found {len(final_matches)} unique sections:")
    for i, match in enumerate(final_matches[:15]):
        logger.info(f"  {i+1}: Item/Part {match['section_id']} - {match['section_title'][:60]}...")

    # Convert to DocumentSection objects
    final_document_sections = []
    current_part = None # Track current part for 10Q item context

    for i, match in enumerate(final_matches):
        start_pos = match['start_pos']
        # End position is the start of the next matched section, or end of content if it's the last one
        end_pos = final_matches[i + 1]['start_pos'] if i + 1 < len(final_matches) else len(content)

        section_content = content[start_pos:end_pos].strip()

        section_id = match['section_id'].upper()
        title = match['section_title']

        section_type = 'content'
        item_number = None
        part = None

        if re.match(r'^[IVX]+$', section_id):
            section_type = 'part'
            part = f"PART {section_id}"
            current_part = part # Update current part
            if title.upper().startswith("PART ") and title.upper().replace("PART ", "").strip() == section_id:
                title = part
            elif not title:
                 title = part
        elif re.match(r'^\d+[A-C]?$', section_id):
            section_type = 'item'
            item_number = section_id
            part = current_part # Inherit part from the last detected PART
            if title.upper().startswith("ITEM ") and title.upper().replace("ITEM ", "").strip() == section_id:
                title = f"Item {item_number}"
            elif not title:
                 title = f"Item {item_number}"

        final_document_sections.append(DocumentSection(
            title=title,
            content=section_content,
            section_type=section_type,
            item_number=item_number,
            part=part, # Store the part info
            start_pos=start_pos,
            end_pos=end_pos
        ))

    return final_document_sections

def detect_sections_from_toc_universal(content: str) -> List[DocumentSection]:
    """
    Extract sections from table of contents - works for any SEC filing.
    This function primarily identifies section titles and item numbers from TOC,
    but does not extract their content directly.
    """
    sections = []

    if not content:
        logger.info("Empty content provided to detect_sections_from_toc_universal. Returning empty sections.")
        return sections

    # Look for table of contents patterns
    # Using re.escape for literal brackets, and compiling patterns once.
    toc_patterns = [
        re.compile(r'(?i)INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)TABLE OF CONTENTS.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(r'(?i)FORM 10-[KQ].*?INDEX.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
        re.compile(re.escape('[TABLE_START]') + r'.*?Page.*?' + re.escape('[TABLE_END]') + r'.*?(?=\s*--- PAGE BREAK ---)', re.DOTALL),
    ]

    toc_content = ""
    for pattern in toc_patterns:
        match = pattern.search(content)
        if match:
            toc_content = match.group(0)
            break

    if not toc_content:
        logger.warning("No table of contents found in detect_sections_from_toc_universal.")
        return sections

    logger.info(f"Found table of contents ({len(toc_content)} chars)")

    # Define patterns for items/parts within the TOC
    # CORRECTED: Removed re.escape from words like "Item" and "PART" within the patterns.
    # Added flexibility for optional periods and variations in spacing.
    item_patterns = [
        # Match Item X. [Title] | Page
        re.compile(r'(?i)Item\s*(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)\s*\|\s*\\d+', re.DOTALL),
        # Match PART X | [Title]
        re.compile(r'(?i)PART\s*([IVX]+)\s*\|\s*([^|]+)', re.DOTALL),
        # Match Item X. [Title] (no pipe/page)
        re.compile(r'(?i)Item\s*(\d{1,2}[A-C]?)\.?\s*([^\n|]+)', re.M),
        # Match X. | [Title] | Page (number-dot format in table)
        re.compile(r'(?i)(\d{1,2}[A-C]?)\.?\s*\|\s*([^|]+?)\s*\|\s*\\d+', re.DOTALL),
        # Match simple PART X line
        re.compile(r'(?i)PART\s*([IVX]+)', re.M)
    ]

    found_items = []
    # This loop also needs to handle empty toc_content if previous steps fail
    if not toc_content: # Defensive check
        return sections

    for pattern in item_patterns:
        for match in pattern.finditer(toc_content):
            groups = match.groups()
            item_id = None
            item_title = ""

            if len(groups) >= 2: # Pattern captured both ID and Title
                item_id = groups[0].strip()
                item_title = groups[1].strip()
            elif len(groups) == 1: # Pattern only captured ID (e.g., simple PART)
                item_id = groups[0].strip()
                # Attempt to get text immediately following the item_id if available, otherwise use a generic title
                line_after_id_match = toc_content[match.end():].split('\n')[0].strip()
                if line_after_id_match:
                    item_title = line_after_id_match
                else:
                    item_title = f"Section {item_id}"

            if item_id: # Only add if we successfully got an ID
                item_title = re.sub(r'\\s+', ' ', item_title).strip() # Normalize whitespace
                found_items.append((item_id, item_title))


    unique_items = []
    seen = set()
    for item_id, title in found_items:
        key = f"{item_id}_{title[:50]}" # Use a longer slice for better uniqueness
        if key not in seen:
            unique_items.append((item_id, title))
            seen.add(key)

    logger.info(f"Extracted {len(unique_items)} sections from table of contents:")
    for item_id, title in unique_items[:10]:
        logger.info(f"  • {item_id}: {title[:50]}...")

    toc_sections = []
    current_part = None # Track current part for TOC items

    for item_id, title in unique_items:
        section_type = 'unknown'
        item_number = None
        part_num = None

        if re.match(r'^\d+[A-C]?$', item_id):
            section_type = 'item'
            item_number = item_id
            part_num = current_part # Assign current part context
        elif re.match(r'^[IVX]+$', item_id):
            section_type = 'part'
            part_num = f"PART {item_id}"
            current_part = part_num # Update current part
        else:
            section_type = 'content' # Treat as generic content section if no item/part found

        toc_sections.append(DocumentSection(
            title=title,
            content="", # Content is intentionally empty here; will be filled by main sectioning if this strategy is chosen.
            section_type=section_type,
            item_number=item_number,
            part=part_num # Store the identified part
        ))
    return toc_sections


def detect_sections_robust_universal(content: str) -> List[DocumentSection]:
    """
    Universal robust section detection for all SEC filings.
    Prioritizes direct pattern matching (which handles tables well), then TOC, then page-based.
    """
    logger.info("Attempting universal SEC section detection")

    # Strategy 1: Direct pattern matching for sections (designed to work well with common SEC patterns)
    sections_strategy1 = detect_sections_universal_sec(content)

    if len(sections_strategy1) >= 3:
        logger.info(f"Universal detection successful (Strategy 1): Found {len(sections_strategy1)} sections.")
        return sections_strategy1

    # Strategy 2: Try parsing Table of Contents.
    logger.warning("Direct detection found few sections, analyzing table of contents.")
    toc_entries = detect_sections_from_toc_universal(content) # These are DocumentSections with only title/metadata, no content

    if toc_entries and len(toc_entries) >= 3:
        logger.info(f"TOC analysis found {len(toc_entries)} potential sections. Attempting to extract content based on TOC titles.")

        combined_sections = []
        current_content_pos = 0

        for i, toc_entry in enumerate(toc_entries):
            # Create flexible regex for the title/item number to find it in the main content
            pattern_parts = []
            if toc_entry.item_number:
                pattern_parts.append(r'Item\s*' + re.escape(toc_entry.item_number) + r'\.?') # "Item 1." or "Item 1A"
            if toc_entry.part:
                pattern_parts.append(r'PART\s*' + re.escape(toc_entry.part.replace("PART ", ""))) # "PART I"
            
            # Use the full title as a fallback if item/part number not found in text
            if toc_entry.title:
                pattern_parts.append(re.escape(toc_entry.title).replace('\\ ', '\\s*'))

            if not pattern_parts: # Should not happen if TOC parsing is good
                continue

            search_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(pattern_parts) + r')', re.M)
            
            match = search_pattern.search(content, pos=current_content_pos)

            if match:
                start_pos = match.start()

                next_start_pos = len(content)
                if i + 1 < len(toc_entries):
                    next_toc_entry = toc_entries[i+1]
                    next_pattern_parts = []
                    if next_toc_entry.item_number:
                        next_pattern_parts.append(r'Item\s*' + re.escape(next_toc_entry.item_number) + r'\.?')
                    if next_toc_entry.part:
                        next_pattern_parts.append(r'PART\s*' + re.escape(next_toc_entry.part.replace("PART ", "")))
                    if next_toc_entry.title:
                        next_pattern_parts.append(re.escape(next_toc_entry.title).replace('\\ ', '\\s*'))

                    if next_pattern_parts:
                        next_pattern = re.compile(r'(?i)^\s*(?:' + '|'.join(next_pattern_parts) + r')', re.M)
                        next_match = next_pattern.search(content, pos=match.end())
                        if next_match:
                            next_start_pos = next_match.start()

                section_content = content[start_pos:next_start_pos].strip()

                combined_sections.append(DocumentSection(
                    title=toc_entry.title,
                    content=section_content,
                    section_type=toc_entry.section_type,
                    item_number=toc_entry.item_number,
                    part=toc_entry.part,
                    start_pos=start_pos,
                    end_pos=next_start_pos
                ))
                current_content_pos = next_start_pos
            else:
                logger.warning(f"Could not find content for TOC entry: '{toc_entry.title}'. This section might be merged with previous or skipped.")

        if len(combined_sections) >= 3:
            logger.info(f"Universal detection successful (TOC-based content mapping): Found {len(combined_sections)} sections.")
            return combined_sections
        else:
            logger.warning("TOC-based content mapping yielded few sections. Falling back to page-based detection.")


    # Strategy 3: Page-based fallback (original strategy 2)
    logger.warning("Trying page-based detection as fallback.")
    sections_strategy2 = detect_sections_strategy_2(content)

    if len(sections_strategy2) >= 2:
        logger.info(f"Page-based detection successful: Found {len(sections_strategy2)} sections.")
        return sections_strategy2

    # Final fallback: return the entire document as a single section
    logger.warning("All strategies failed, creating single section.")
    return [DocumentSection(
        title="Full Document",
        content=content,
        section_type='document',
        start_pos=0,
        end_pos=len(content)
    )]


# Helper function to extract metadata from filename
def extract_metadata_from_filename(file_path: str) -> FilingMetadata:
    filename = Path(file_path).name
    file_id = filename.replace(".txt", "")
    parts = file_id.split('_')

    if len(parts) != 3:
        logger.warning(f"Malformed filename: {filename}. Using default metadata.")
        return FilingMetadata(
            ticker="UNKNOWN",
            form_type="UNKNOWN",
            filing_date="1900-01-01",
            fiscal_year=1900,
            fiscal_quarter=1,
            file_path=file_path
        )

    ticker, form_type, filing_date_str = parts

    try:
        filing_date = pd.to_datetime(filing_date_str)
        fiscal_year = filing_date.year
        fiscal_quarter = filing_date.quarter
    except pd.errors.ParserError:
        logger.error(f"Could not parse filing date from {filing_date_str} in {filename}. Using default values.")
        fiscal_year = 1900
        fiscal_quarter = 1

    # Adjust fiscal year for 10-K filings if the filing date is early in the calendar year
    # and typically refers to the previous fiscal year end.
    if form_type == '10K' and filing_date.month <= 3: # Assuming fiscal year ends typically in Dec or Jan-Mar for previous year
        fiscal_year -= 1 # Often a 10K filed in Jan-Mar of current year is for previous fiscal year

    return FilingMetadata(
        ticker=ticker,
        form_type=form_type,
        filing_date=filing_date_str,
        fiscal_year=fiscal_year,
        fiscal_quarter=fiscal_quarter,
        file_path=file_path
    )


# =============================================================================
# MAIN PROCESSING FUNCTION (Universal)
# =============================================================================
def process_filing_robust_universal(file_path: str, target_tokens: int = 500, overlap_tokens: int = 100) -> List[Chunk]:
    """
    Universal processing function for all SEC filings
    """
    try:
        # Extract filing metadata
        filing_metadata = extract_metadata_from_filename(file_path)
        filename = Path(file_path).name # For logging clarity
        file_id = filename.replace(".txt", "") # For chunk_id creation

        # Read and clean content
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        cleaned_content = clean_sec_text(raw_content)

        # Basic check for empty content after cleaning
        if not cleaned_content.strip():
            logger.warning(f"Cleaned content for {filename} is empty. No chunks created.")
            return []

        # Use universal section detection
        sections = detect_sections_robust_universal(cleaned_content)
        logger.info(f"Found {len(sections)} sections in {filename}")

        # Process each section
        all_chunks = []
        chunk_counter = 0

        for section in sections:
            # Ensure section.content is not empty before processing
            if not section.content.strip():
                continue # Skip empty sections

            # Extract tables and narrative from this section's content
            tables_in_section, narrative_content_in_section = extract_and_process_tables(section.content)

            # Create section info string using the original create_section_info
            section_info = create_section_info(section, filing_metadata.form_type)

            # Process tables found within this section
            for table in tables_in_section:
                chunk = Chunk(
                    chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                    text=table['text'],
                    token_count=table['token_count'],
                    chunk_type='table',
                    section_info=section_info,
                    filing_metadata=filing_metadata,
                    chunk_index=chunk_counter,
                    has_overlap=False
                )
                all_chunks.append(chunk)
                chunk_counter += 1

            # Process narrative content from this section
            if narrative_content_in_section.strip():
                # Use the existing create_overlapping_chunks for narrative
                narrative_sub_chunks = create_overlapping_chunks(
                    narrative_content_in_section, target_tokens, overlap_tokens
                )

                for chunk_data in narrative_sub_chunks:
                    chunk = Chunk(
                        chunk_id=f"{file_id}-chunk-{chunk_counter:04d}",
                        text=chunk_data['text'],
                        token_count=chunk_data['token_count'],
                        chunk_type='narrative',
                        section_info=section_info,
                        filing_metadata=filing_metadata,
                        chunk_index=chunk_counter,
                        has_overlap=chunk_data['has_overlap']
                    )
                    all_chunks.append(chunk)
                    chunk_counter += 1

        logger.info(f"Created {len(all_chunks)} chunks for {filename}")
        return all_chunks

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

# =============================================================================
# 5. IMPROVED SENTENCE-AWARE CHUNKING
# =============================================================================

def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using multiple heuristics
    """
    # Simple sentence splitting (can be improved with spaCy/NLTK)
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

    # Clean up sentences
    sentences = [s.strip() for s in sentences if s.strip()]

    return sentences

def create_overlapping_chunks(text: str, target_tokens: int = 500, overlap_tokens: int = 100,
                            min_tokens: int = 50) -> List[Dict[str, Any]]:
    """
    Create semantically aware chunks with overlap
    """
    sentences = split_into_sentences(text)
    chunks = []

    current_chunk_sentences = []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = len(encoding.encode(sentence))

        # If adding this sentence exceeds target, finalize current chunk
        if current_tokens + sentence_tokens > target_tokens and current_chunk_sentences:
            chunk_text = ' '.join(current_chunk_sentences)
            chunks.append({
                'text': chunk_text,
                'token_count': current_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

            # Create overlap: keep last few sentences
            overlap_sentences = []
            current_overlap_tokens = 0 # Renamed variable to avoid conflict with function parameter 'overlap_tokens'

            # Add sentences from the end until we reach overlap target
            # Ensure we don't go past the start of the chunk
            for sent_idx in range(len(current_chunk_sentences) - 1, -1, -1):
                sent = current_chunk_sentences[sent_idx]
                sent_tokens = len(encoding.encode(sent))
                if current_overlap_tokens + sent_tokens <= overlap_tokens:
                    overlap_sentences.insert(0, sent)
                    current_overlap_tokens += sent_tokens
                else:
                    break
            
            # If after trying to create overlap, we still don't have enough tokens for overlap
            # (e.g., first few sentences are very long), just take some minimal content.
            if not overlap_sentences and current_chunk_sentences:
                # Fallback to last sentence if no other overlap possible and current chunk exists
                overlap_sentences = [current_chunk_sentences[-1]]
                current_overlap_tokens = len(encoding.encode(overlap_sentences[0]))


            # Start new chunk with overlap + current sentence
            current_chunk_sentences = overlap_sentences + [sentence]
            current_tokens = current_overlap_tokens + sentence_tokens
        else:
            # Add sentence to current chunk
            current_chunk_sentences.append(sentence)
            current_tokens += sentence_tokens

    # Add final chunk if it has content
    if current_chunk_sentences:
        chunk_text = ' '.join(current_chunk_sentences)
        final_tokens = len(encoding.encode(chunk_text))

        if final_tokens >= min_tokens:
            chunks.append({
                'text': chunk_text,
                'token_count': final_tokens,
                'sentence_count': len(current_chunk_sentences),
                'has_overlap': len(chunks) > 0
            })

    return chunks

# =============================================================================
# 6. TABLE HANDLING
# =============================================================================

def extract_and_process_tables(content: str) -> Tuple[List[Dict], str]:
    """
    Extract tables and return both table chunks and narrative text
    """
    table_pattern = re.compile(r'=== TABLE START ===.*?=== TABLE END ===', re.DOTALL)
    tables = []

    # Find all tables
    for i, match in enumerate(table_pattern.finditer(content)):
        table_content = match.group(0)
        # Clean table markers
        table_text = table_content.replace('=== TABLE START ===', '').replace('=== TABLE END ===', '').strip()

        if table_text:  # Only add non-empty tables
            tables.append({
                'text': table_text,
                'token_count': len(encoding.encode(table_text)),
                'table_index': i,
                'chunk_type': 'table'
            })

    # Remove tables from content to get narrative text
    narrative_content = table_pattern.sub('', content).strip()

    return tables, narrative_content

# =============================================================================
# 8. TESTING AND VALIDATION
# =============================================================================

def validate_chunks(chunks: List[Chunk]) -> Dict[str, Any]:
    """
    Validate the quality of our chunks
    """
    if not chunks:
        return {"error": "No chunks created"}

    token_counts = [chunk.token_count for chunk in chunks]

    stats = {
        "total_chunks": len(chunks),
        "avg_tokens": sum(token_counts) / len(token_counts),
        "min_tokens": min(token_counts),
        "max_tokens": max(token_counts),
        "chunks_with_overlap": sum(1 for chunk in chunks if chunk.has_overlap),
        "table_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'table'),
        "narrative_chunks": sum(1 for chunk in chunks if chunk.chunk_type == 'narrative'),
        "unique_sections": len(set(chunk.section_info for chunk in chunks))
    }

    return stats

# =============================================================================
# 9. LET'S TEST THIS!
# =============================================================================

print("🚀 SEC Filing Preprocessing Strategy - Ready for Testing!\n")
print("="*60)
print("Key improvements over original approach:\n")
print("✅ Multi-strategy section detection with fallbacks\n")
print("✅ Sentence-aware chunking with overlap\n")
print("✅ Robust error handling and logging\n")
print("✅ Structured data classes for better organization\n")
print("✅ Quality validation and statistics\n")
print("✅ Separate table and narrative processing\n")
print("="*60)


def test_single_file():
    """Test our preprocessing on a single file"""
    # Replace with an actual file path from your processed_filings directory
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if os.path.exists(test_file):
        print(f"🧪 Testing with: {test_file}\n")
        print("="*50)

        # Changed to universal processing function
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks)

        print("📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        print("\n📝 Sample Chunks:\n")
        for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
            print(f"\nChunk {i+1} ({chunk.chunk_type}):\n")
            print(f"  Section: {chunk.section_info}\n")
            print(f"  Tokens: {chunk.token_count}\n")
            print(f"  Text preview: {chunk.text[:200]}...\n")

        return chunks
    else:
        print(f"❌ File not found: {test_file}\n")
        print("Please update the file path to match your data structure\n")
        return []

# Run the test
chunks = test_single_file()

def compare_section_strategies(content: str): # Changed content_sample to content to use full content
    """Compare how different strategies perform"""
    print("🔍 Comparing Section Detection Strategies\n")
    print("="*50)

    # Strategy 1: Robust regex
    sections_1 = detect_sections_strategy_1_improved(content) # Changed content_sample to content
    print(f"Strategy 1 (Regex): {len(sections_1)} sections\n")
    for i, section in enumerate(sections_1[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    print()

    # Strategy 2: Page-based fallback
    sections_2 = detect_sections_strategy_2(content) # Changed content_sample to content
    print(f"Strategy 2 (Page-based): {len(sections_2)} sections\n")
    for i, section in enumerate(sections_2[:5]):  # Show first 5
        print(f"  {i+1}. {section.title[:60]}...\n")

    return sections_1, sections_2

# Test if we have chunks from previous test
if chunks:
    # Use the first chunk's filing to get the full content
    test_file = chunks[0].filing_metadata.file_path
    with open(test_file, 'r', encoding='utf-8') as f:
        # Load full content for comparison, not just a sample
        full_content_for_comparison = f.read()
    cleaned_content_for_comparison = clean_sec_text(full_content_for_comparison) # Clean it for consistent comparison

    sections_1_comp, sections_2_comp = compare_section_strategies(cleaned_content_for_comparison)


def analyze_chunking_quality(chunks: List[Chunk]):
    """Deep dive into chunk quality"""
    if not chunks:
        print("No chunks to analyze\n")
        return

    print("📊 Chunking Quality Analysis\n")
    print("="*50)

    # Token distribution
    token_counts = [chunk.token_count for chunk in chunks]

    print(f"Token Distribution:\n")
    print(f"  Mean: {sum(token_counts)/len(token_counts):.1f}\n")
    print(f"  Median: {sorted(token_counts)[len(token_counts)//2]}\n")
    print(f"  Min: {min(token_counts)}\n")
    print(f"  Max: {max(token_counts)}\n")

    # Chunk types
    chunk_types = {}
    for chunk in chunks:
        chunk_types[chunk.chunk_type] = chunk_types.get(chunk.chunk_type, 0) + 1

    print(f"\nChunk Types:\n")
    for chunk_type, count in chunk_types.items():
        print(f"  {chunk_type}: {count}\n")

    # Section distribution
    sections_dist = {} # Renamed to avoid conflict with `sections` list
    for chunk in chunks:
        sections_dist[chunk.section_info] = sections_dist.get(chunk.section_info, 0) + 1

    print(f"\nSection Distribution:\n")
    for section, count in sorted(sections_dist.items()):
        print(f"  {section}: {count} chunks\n")

    # Overlap analysis
    overlap_count = sum(1 for chunk in chunks if chunk.has_overlap)
    print(f"\nOverlap Analysis:\n")
    print(f"  Chunks with overlap: {overlap_count}/{len(chunks)} ({overlap_count/len(chunks)*100:.1f}%)\n")

    return {
        'token_stats': {
            'mean': sum(token_counts)/len(token_counts),
            'median': sorted(token_counts)[len(token_counts)//2],
            'min': min(token_counts),
            'max': max(token_counts)
        },
        'chunk_types': chunk_types,
        'sections': sections_dist,
        'overlap_rate': overlap_count/len(chunks)
    }

# Analyze our test chunks
if chunks:
    quality_analysis = analyze_chunking_quality(chunks)


def test_chunking_parameters():
    """Test different parameter combinations"""
    if not chunks:
        print("No test file processed yet\n")
        return

    test_file = chunks[0].filing_metadata.file_path

    print("🔧 Testing Different Chunking Parameters\n")
    print("="*50)

    # Test different parameter combinations
    param_configs = [
        {"target_tokens": 300, "overlap_tokens": 50, "name": "Small chunks, low overlap"},
        {"target_tokens": 500, "overlap_tokens": 100, "name": "Medium chunks, medium overlap"},
        {"target_tokens": 800, "overlap_tokens": 150, "name": "Large chunks, high overlap"},
    ]

    results = {}

    for config in param_configs:
        print(f"\n🧪 Testing: {config['name']}\n")
        # Changed to universal processing function
        test_chunks = process_filing_robust_universal(
            test_file,
            target_tokens=config['target_tokens'],
            overlap_tokens=config['overlap_tokens']
        )

        stats = validate_chunks(test_chunks)
        results[config['name']] = stats

        print(f"  Total chunks: {stats['total_chunks']}\n")
        print(f"  Avg tokens: {stats['avg_tokens']:.1f}\n")
        print(f"  Overlap rate: {stats['chunks_with_overlap']}/{stats['total_chunks']}\n")

    return results

# Test different parameters
param_results = test_chunking_parameters()


def test_error_handling():
    """Test how our system handles various edge cases"""
    print("🛡️ Testing Error Handling\n")
    print("="*50)

    # Test 1: Non-existent file
    print("Test 1: Non-existent file\n")
    # Changed to universal processing function
    fake_chunks = process_filing_robust_universal("non_existent_file.txt")
    print(f"  Result: {len(fake_chunks)} chunks (expected 0)\n")

    # Test 2: Empty file
    print("\nTest 2: Empty content\n")
    empty_sections = detect_sections_robust_universal("") # Changed to universal detection
    print(f"  Result: {len(empty_sections)} sections\n")

    # Test 3: Malformed filename
    print("\nTest 3: Malformed filename\n")
    # Create a temporary file with bad name
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='_bad_name.txt', delete=False) as f:
        f.write("Some content")
        temp_file = f.name

    # Changed to universal processing function
    bad_chunks = process_filing_robust_universal(temp_file)
    print(f"  Result: {len(bad_chunks)} chunks (expected 0)\n")

    # Clean up
    os.unlink(temp_file)

    # Test 4: Very short text
    print("\nTest 4: Very short text\n")
    # This call is correct, as create_overlapping_chunks is a helper
    short_chunks = create_overlapping_chunks("Short text.", target_tokens=500)
    print(f"  Result: {len(short_chunks)} chunks\n")

test_error_handling()


def test_batch_processing(max_files: int = 5):
    """Test processing multiple files"""
    print(f"🔄 Testing Batch Processing (max {max_files} files)\n")
    print("="*50)

    data_path = "processed_filings/"
    if not os.path.exists(data_path):
        print(f"❌ Data path not found: {data_path}\n")
        return []

    # Get all files
    all_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.txt'):
                all_files.append(os.path.join(root, file))

    # Process a subset
    test_files = all_files[:max_files]
    print(f"Processing {len(test_files)} files...\n")

    all_results = []

    for i, file_path in enumerate(test_files):
        print(f"  {i+1}/{len(test_files)}: {os.path.basename(file_path)}\n")

        # Changed to universal processing function
        file_chunks = process_filing_robust_universal(file_path)
        stats = validate_chunks(file_chunks)

        all_results.append({
            'file': os.path.basename(file_path),
            'chunks': len(file_chunks),
            'avg_tokens': stats.get('avg_tokens', 0),
            'sections': stats.get('unique_sections', 0),
            'tables': stats.get('table_chunks', 0)
        })

    # Summary statistics
    print(f"\n📊 Batch Processing Summary:\n")
    total_chunks = sum(r['chunks'] for r in all_results)
    avg_chunks_per_file = total_chunks / len(all_results) if all_results else 0

    print(f"  Total files processed: {len(all_results)}\n")
    print(f"  Total chunks created: {total_chunks}\n")
    print(f"  Average chunks per file: {avg_chunks_per_file:.1f}\n")

    print(f"\n📋 Per-file results:\n")
    for result in all_results:
        print(f"  {result['file']}: {result['chunks']} chunks, {result['sections']} sections, {result['tables']} tables\n")

    return all_results

# Run batch test
batch_results = test_batch_processing(max_files=3)


def create_analysis_summary():
    """Create a comprehensive summary of our preprocessing"""
    print("📈 Final Analysis Summary\n")
    print("="*60)

    # Assumes 'chunks' variable from test_single_file() is available
    if 'chunks' not in globals() or not chunks:
        print("No chunks to analyze - run test_single_file() first\n")
        return

    # Create a mini dataset for analysis
    chunk_data = []
    for chunk in chunks:
        chunk_data.append({
            'chunk_id': chunk.chunk_id,
            'tokens': chunk.token_count,
            'type': chunk.chunk_type,
            'section': chunk.section_info,
            'has_overlap': chunk.has_overlap,
            'ticker': chunk.filing_metadata.ticker,
            'form_type': chunk.filing_metadata.form_type,
            'fiscal_year': chunk.filing_metadata.fiscal_year
        })

    df = pd.DataFrame(chunk_data)

    print("🎯 Key Insights:\n")
    print(f"  • Document: {df['ticker'].iloc[0]} {df['form_type'].iloc[0]} (FY{df['fiscal_year'].iloc[0]})\n")
    print(f"  • Total chunks: {len(df)}\n")
    print(f"  • Average chunk size: {df['tokens'].mean():.0f} tokens\n")
    print(f"  • Size range: {df['tokens'].min()} - {df['tokens'].max()} tokens\n")
    print(f"  • Overlap rate: {(df['has_overlap'].sum() / len(df) * 100):.1f}%\n")

    print(f"\n📊 Chunk Distribution by Type:\n")
    type_dist = df['type'].value_counts()
    for chunk_type, count in type_dist.items():
        percentage = (count / len(df)) * 100
        print(f"  • {chunk_type}: {count} chunks ({percentage:.1f}%)\n")

    print(f"\n📚 Section Breakdown:\n")
    section_dist = df['section'].value_counts()
    for section, count in section_dist.head(8).items():  # Top 8 sections
        print(f"  • {section}: {count} chunks\n")

    # Quality metrics
    print(f"\n✅ Quality Metrics:\n")

    # Check for very small chunks (potential issues)
    small_chunks = df[df['tokens'] < 50]
    print(f"  • Very small chunks (<50 tokens): {len(small_chunks)} ({len(small_chunks)/len(df)*100:.1f}%)\n")

    # Check for very large chunks (might need splitting)
    large_chunks = df[df['tokens'] > 800]
    print(f"  • Large chunks (>800 tokens): {len(large_chunks)} ({len(large_chunks)/len(df)*100:.1f}%)\n")

    # Check section coverage
    unique_sections = df['section'].nunique()
    print(f"  • Unique sections identified: {unique_sections}\n")

    # Show some example chunks for manual review
    print(f"\n🔍 Sample Chunks for Review:\n")

    # Show one of each type
    for chunk_type in df['type'].unique():
        sample = df[df['type'] == chunk_type].iloc[0]
        # Find the actual chunk object to get its full text
        chunk_obj = next(c for c in chunks if c.chunk_id == sample['chunk_id'])
        print(f"\n  {chunk_type.upper()} example ({sample['tokens']} tokens):\n")
        print(f"    Section: {sample['section']}\n")
        print(f"    Preview: {chunk_obj.text[:150]}...\n")

    return df

# Create final summary
summary_df = create_analysis_summary()


def compare_with_original():
    """Compare our approach with the original chunking strategy"""
    print("⚖️ Comparison: New vs Original Approach\n")
    print("="*60)

    improvements = [
        "✅ Multi-strategy section detection (fallbacks for robustness)",
        "✅ Sentence-aware chunking (preserves semantic boundaries)",
        "✅ Overlapping chunks (maintains context across boundaries)",
        "✅ Separate table processing (handles structured data better)",
        "✅ Comprehensive error handling (graceful degradation)",
        "✅ Rich metadata structure (better for search/filtering)",
        "✅ Quality validation (ensures chunk coherence)",
        "✅ Configurable parameters (tunable for different use cases)"
    ]

    potential_tradeoffs = [
        "⚠️ Slightly more complex code (but more maintainable)",
        "⚠️ More chunks due to overlap (but better retrieval)",
        "⚠️ Processing takes longer (but more robust results)"
    ]

    print("🚀 Key Improvements:\n")
    for improvement in improvements:
        print(f"  {improvement}\n")

    print(f"\n⚖️ Potential Tradeoffs:\n")
    for tradeoff in potential_tradeoffs:
        print(f"  {tradeoff}\n")

    print(f"\n🎯 Recommended Next Steps:\n")
    next_steps = [
        "1. Test on more diverse filings to validate robustness",
        "2. Fine-tune chunking parameters based on embedding performance",
        "3. Add semantic similarity checks between overlapping chunks",
        "4. Implement incremental processing for large datasets",
        "5. Add support for other SEC forms (8-K, DEF 14A, etc.)",
        "6. Create embedding quality metrics and evaluation"
    ]

    for step in next_steps:
        print(f"  {step}\n")

    print("\n" + "="*60)
    print("🎉 Preprocessing Strategy Testing Complete!\n")
    print("="*60)
    print("Next step: Convert this notebook into modular Python files\n")
    print("Then: Implement the embedding pipeline and MCP server!\n")
    print("="*60)

compare_with_original()

# Test functions adapted to _fixed suffix to avoid NameErrors from notebook re-runs
# Ensure these are called after all function definitions.
print("🚀 Ready to test universal SEC detection!\n")
print("\n1. Run test_universal_detection_fixed() to test all files\n")
print("2. Run compare_old_vs_universal_fixed() to see the improvement\n")
print("3. Run quick_pattern_test_fixed() to see what patterns match\n")

# Define the _fixed test functions so they are available when called below
def test_universal_detection_fixed():
    """Test the universal detection on all your file types"""

    test_files = [
        "processed_filings/AAPL/AAPL_10K_2020-10-30.txt",
        "processed_filings/AMZN/AMZN_10K_2023-02-03.txt",
        "processed_filings/AMZN/AMZN_10Q_2024-11-01.txt", # This file name is in the future based on current date
        "processed_filings/KO/KO_10Q_2020-07-22.txt"
    ]

    results = {}

    for test_file in test_files:
        if not os.path.exists(test_file):
            print(f"⚠️ Skipping {test_file} - file not found\n")
            continue

        print(f"\n🧪 Testing: {test_file}\n")
        print("=" * 80)

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Test universal detection
        sections = detect_sections_robust_universal(content)

        print(f"\n✅ Found {len(sections)} sections:\n")
        for i, section in enumerate(sections[:10]):
            print(f"  {i+1}. {section.title}\n")
            print(f"     Type: {section.section_type}, Length: {len(section.content):,} chars\n")

        # Test full pipeline
        chunks = process_filing_robust_universal(test_file)
        stats = validate_chunks(chunks) if chunks else {"error": "No chunks created"}

        results[test_file] = {
            'sections': len(sections),
            'chunks': len(chunks) if chunks else 0,
            'stats': stats
        }

        print(f"\n📊 Processing Results:\n")
        for key, value in stats.items():
            print(f"  {key}: {value}\n")

        if chunks:
            section_counts = {}
            for chunk in chunks[:20]:
                section = chunk.section_info
                section_counts[section] = section_counts.get(section, 0) + 1

            print(f"\n📚 Section Distribution (sample):\n")
            for section, count in sorted(section_counts.items()):
                print(f"  • {section}: {count} chunks\n")

    print(f"\n" + "="*80)
    print("📊 UNIVERSAL DETECTION SUMMARY\n")
    print("="*80)

    for file_path, result in results.items():
        filename = file_path.split('/')[-1]
        print(f"{filename:<25} | {result['sections']:>2} sections | {result['chunks']:>3} chunks\n")

    return results

def compare_old_vs_universal_fixed():
    """Compare the old detection vs universal detection"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found for comparison\n")
        return

    print("⚖️ OLD vs UNIVERSAL Detection Comparison\n")
    print("="*60)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print("Running old detection...\n")
    old_sections = detect_sections_robust_old(content)

    print("Running universal detection...\n")
    new_sections = detect_sections_robust_universal(content)

    print(f"\n📊 Comparison Results:\n")
    print(f"  Old detection: {len(old_sections)} sections\n")
    print(f"  Universal detection: {len(new_sections)} sections\n")
    print(f"  Improvement: +{len(new_sections) - len(old_sections)} sections\n")

    print(f"\n📋 Old Sections:\n")
    for i, section in enumerate(old_sections):
        print(f"  {i+1}. {section.title}\n")

    print(f"\n📋 Universal Sections:\n")
    for i, section in enumerate(new_sections):
        print(f"  {i+1}. {section.title}\n")

    return old_sections, new_sections

def quick_pattern_test_fixed():
    """Quick test to see what patterns match in your content"""
    test_file = "processed_filings/AAPL/AAPL_10K_2020-10-30.txt"

    if not os.path.exists(test_file):
        print("Test file not found\n")
        return

    print("🔍 QUICK PATTERN TEST\n")
    print("="*50)

    with open(test_file, 'r', encoding='utf-8') as f:
        content = f.read()

    patterns = [
        (re.compile(r'\[TABLE_START\](?:.|\n)*?Item(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Items"),
        (re.compile(r'Item\s+\d+[A-C]?\.\s*\|', re.I), "Pipe-separated Items"),
        (re.compile(r'PART\s+[IVX]+', re.I), "Part headers"),
        (re.compile(r'\[TABLE_START\](?:.|\n)*?PART(?:.|\n)*?\[TABLE_END\]', re.I | re.DOTALL), "Table-wrapped Parts"),
    ]

    for compiled_pattern, description in patterns:
        matches = compiled_pattern.findall(content)
        print(f"\n{description}: {len(matches)} matches\n")
        for i, match in enumerate(matches[:3]):
            clean_match = ' '.join(match.split())[:100]
            print(f"  {i+1}: {clean_match}...\n")

# Run the fixed tests
results_universal = test_universal_detection_fixed()
old_vs_new_sections = compare_old_vs_universal_fixed()
quick_pattern_test_fixed()

INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 262 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt


🚀 SEC Filing Preprocessing Strategy - Ready for Testing!

Key improvements over original approach:

✅ Multi-strategy section detection with fallbacks

✅ Sentence-aware chunking with overlap

✅ Robust error handling and logging

✅ Structured data classes for better organization

✅ Quality validation and statistics

✅ Separate table and narrative processing

🧪 Testing with: processed_filings/AAPL/AAPL_10K_2020-10-30.txt

📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overlap: 105

  table_chunks: 66

  narrative_chunks: 106

  unique_sections: 1


📝 Sample Chunks:


Chunk 1 (table):

  Section: Full Document

  Tokens: 58

  Text preview: California | 94-2404110 | (State or other jurisdiction | of incorporation or organization) | (I.R.S. Employer Identification No.) | One Apple Park Way | Cupertino | , | California | 95014 | (Address o...


Chunk 2 (table):

  Section: Full Document

  Tokens: 240

  Text 

INFO:__main__:Created 172 chunks for AAPL_10K_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1367 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in AAPL_10K_2020-10-30.txt
INFO:__main__:Created 127 chunks for AAPL_10K_2020-10-30.txt
ERROR:__main__:Error processing non_existent_file.txt: Unknown datetime string format, unable to parse: file, at position 0
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:Empty content provided to detect_sections_universal_sec. Returning empty sections.
INFO:__main__:Empty content provided to detect_sections_from_toc_universal. Returning empty sections.
ERROR:__main__:Error processing /var/folders/pj/bmp5122d3d77bzq_cvf0wbl40000gn/T/tmppv1xe7oi_bad_name.txt: Unknown datetime string format, unable to parse: name, at position 0
INFO:__main__:Attempting univers

  Total chunks: 172

  Avg tokens: 379.9

  Overlap rate: 105/172


🧪 Testing: Large chunks, high overlap

  Total chunks: 127

  Avg tokens: 495.8

  Overlap rate: 60/127

🛡️ Testing Error Handling

Test 1: Non-existent file

  Result: 0 chunks (expected 0)


Test 2: Empty content

  Result: 1 sections


Test 3: Malformed filename

  Result: 0 chunks (expected 0)


Test 4: Very short text

  Result: 0 chunks

🔄 Testing Batch Processing (max 3 files)

Processing 3 files...

  1/3: AMZN_10Q_2022-04-29.txt

  2/3: AMZN_10Q_2020-05-01.txt



INFO:__main__:Created 195 chunks for AMZN_10Q_2020-05-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (901 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 1: ...
INFO:__main__:  • 2: ...
INFO:__main__:  • 3: ...
INFO:__main__:  • 4: ...
INFO:__main__:  • 1A: ...
INFO:__main__:  • 5: ...
INFO:__main__:  • 6: ...
INFO:__main__:  • I: . FINANCIAL INFORMATION | Item 1. | Financial Stat...
INFO:__main__:  • II: . OTHER INFORMATION | Item 1. | Legal Proceedings ...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AMZN_10Q_2020-10-30.txt
INFO:__main__:Created 120 chunks for AMZN_10Q_2020-10-30.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 17 unique sections:
INFO:__main__:  1: Item/Part I - 

  3/3: AMZN_10Q_2020-10-30.txt


📊 Batch Processing Summary:

  Total files processed: 3

  Total chunks created: 440

  Average chunks per file: 146.7


📋 Per-file results:

  AMZN_10Q_2022-04-29.txt: 125 chunks, 1 sections, 51 tables

  AMZN_10Q_2020-05-01.txt: 195 chunks, 1 sections, 131 tables

  AMZN_10Q_2020-10-30.txt: 120 chunks, 1 sections, 48 tables

📈 Final Analysis Summary

🎯 Key Insights:

  • Document: AAPL 10K (FY2020)

  • Total chunks: 172

  • Average chunk size: 380 tokens

  • Size range: 38 - 1692 tokens

  • Overlap rate: 61.0%


📊 Chunk Distribution by Type:

  • narrative: 106 chunks (61.6%)

  • table: 66 chunks (38.4%)


📚 Section Breakdown:

  • Full Document: 172 chunks


✅ Quality Metrics:

  • Very small chunks (<50 tokens): 2 (1.2%)

  • Large chunks (>800 tokens): 3 (1.7%)

  • Unique sections identified: 1


🔍 Sample Chunks for Review:


  TABLE example (58 tokens):

    Section: Full Document

    Preview: California | 94-2404110 | (State or other juris

INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 6 - Selected Financial Data...
INFO:__main__:  7: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  8: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  9: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  10: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  11: Item/Part 9B - Other Information...
INFO:__main__:  12: Item/Part 11 - Executive Compensation...
INFO:__main__:  13: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...
INFO:__main__:  14: Item/Part 13 - Certain Relationships and Related Transactions, and Director...
INFO:__main__:  15: Item/Pa


✅ Found 17 sections:

  1. Item 1.    Business

     Type: part, Length: 13,274 chars

  2. Risk Factors

     Type: item, Length: 61,136 chars

  3. Unresolved Staff Comments

     Type: item, Length: 582 chars

  4. Legal Proceedings

     Type: item, Length: 898 chars

  5. Mine Safety Disclosures

     Type: item, Length: 4,292 chars

  6. Selected Financial Data

     Type: item, Length: 1,745 chars

  7. Management’s Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 33,154 chars

  8. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 6,799 chars

  9. Financial Statements and Supplementary Data

     Type: item, Length: 103,042 chars

  10. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure

     Type: item, Length: 4,635 chars


📊 Processing Results:

  total_chunks: 172

  avg_tokens: 379.86046511627904

  min_tokens: 38

  max_tokens: 1692

  chunks_with_overl

INFO:__main__:  1: Item/Part I - [TABLE_START]...
INFO:__main__:  2: Item/Part II - [TABLE_START]...
INFO:__main__:  3: Item/Part III - [TABLE_START]...
INFO:__main__:  4: Item/Part IV - [TABLE_START]...
INFO:__main__:Universal detection successful (Strategy 1): Found 4 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (1441 chars)
INFO:__main__:Extracted 30 sections from table of contents:
INFO:__main__:  • I: Item 1....
INFO:__main__:  • II: Item 5....
INFO:__main__:  • III: Item 10....
INFO:__main__:  • IV: Item 15....
INFO:__main__:  • 1: ...
INFO:__main__:  • 1A: ...
INFO:__main__:  • 1B: ...
INFO:__main__:  • 2: ...
INFO:__main__:  • 3: ...
INFO:__main__:  • 4: ...
INFO:__main__:TOC analysis found 30 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AMZN_10K_2023-02-03.txt
INFO:__main__:Created 210 chunks fo


✅ Found 4 sections:

  1. [TABLE_START]

     Type: part, Length: 71,104 chars

  2. [TABLE_START]

     Type: part, Length: 189,316 chars

  3. [TABLE_START]

     Type: part, Length: 2,224 chars

  4. [TABLE_START]

     Type: part, Length: 10,492 chars


📊 Processing Results:

  total_chunks: 210

  avg_tokens: 332.1666666666667

  min_tokens: 6

  max_tokens: 1157

  chunks_with_overlap: 119

  table_chunks: 90

  narrative_chunks: 120

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/AMZN/AMZN_10Q_2024-11-01.txt



INFO:__main__:  1: Item/Part I - . FINANCIAL INFORMATION...
INFO:__main__:  2: Item/Part II - . OTHER INFORMATION...
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (903 chars)
INFO:__main__:Extracted 9 sections from table of contents:
INFO:__main__:  • 1: ...
INFO:__main__:  • 2: ...
INFO:__main__:  • 3: ...
INFO:__main__:  • 4: ...
INFO:__main__:  • 1A: ...
INFO:__main__:  • 5: ...
INFO:__main__:  • 6: ...
INFO:__main__:  • I: . FINANCIAL INFORMATION | Item 1. | Financial Stat...
INFO:__main__:  • II: . OTHER INFORMATION | Item 1. | Legal Proceedings ...
INFO:__main__:TOC analysis found 9 potential sections. Attempting to extract content based on TOC titles.
INFO:__main__:Found 1 sections in AMZN_10Q_2024-11-01.txt
INFO:__main__:Created 132 chunks for AMZN_10Q_2024-11-01.txt
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection fou


✅ Found 1 sections:

  1. Full Document

     Type: document, Length: 187,951 chars


📊 Processing Results:

  total_chunks: 132

  avg_tokens: 366.43939393939394

  min_tokens: 7

  max_tokens: 1548

  chunks_with_overlap: 81

  table_chunks: 50

  narrative_chunks: 82

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


🧪 Testing: processed_filings/KO/KO_10Q_2020-07-22.txt



INFO:__main__:  1: Item/Part I - . Financial Information...
INFO:__main__:  2: Item/Part 2 - Management's Discussion and Analysis of Financial Condition ...
INFO:__main__:  3: Item/Part 3 - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  4: Item/Part 4 - Controls and Procedures...
INFO:__main__:  5: Item/Part II - . Other Information...
INFO:__main__:  6: Item/Part 1A - Risk Factors...
INFO:__main__:  7: Item/Part 2 - Unregistered Sales of Equity Securities and Use of Proceeds...
INFO:__main__:  8: Item/Part 6 - Exhibits...
INFO:__main__:Universal detection successful (Strategy 1): Found 8 sections.
INFO:__main__:Attempting universal SEC section detection
INFO:__main__:🔍 Universal SEC detection found 0 unique sections:
INFO:__main__:Found table of contents (5004 chars)
INFO:__main__:Extracted 0 sections from table of contents:
INFO:__main__:Found 1 sections in KO_10Q_2020-07-22.txt
INFO:__main__:Created 161 chunks for KO_10Q_2020-07-22.txt
INFO:__main__:At


✅ Found 8 sections:

  1. . Financial Information

     Type: part, Length: 115,924 chars

  2. Management's Discussion and Analysis of Financial Condition and Results of Operations

     Type: item, Length: 87,923 chars

  3. Quantitative and Qualitative Disclosures About Market Risk

     Type: item, Length: 207 chars

  4. Controls and Procedures

     Type: item, Length: 1,004 chars

  5. . Other Information

     Type: part, Length: 248 chars

  6. Risk Factors

     Type: item, Length: 11,661 chars

  7. Unregistered Sales of Equity Securities and Use of Proceeds

     Type: item, Length: 2,127 chars

  8. Exhibits

     Type: item, Length: 13,918 chars


📊 Processing Results:

  total_chunks: 161

  avg_tokens: 396.7577639751553

  min_tokens: 32

  max_tokens: 1451

  chunks_with_overlap: 97

  table_chunks: 63

  narrative_chunks: 98

  unique_sections: 1


📚 Section Distribution (sample):

  • Full Document: 20 chunks


📊 UNIVERSAL DETECTION SUMMARY

AAPL_10K_2020-10-30.txt 

INFO:__main__:  1: Item/Part I - Item 1.    Business...
INFO:__main__:  2: Item/Part 1A - Risk Factors...
INFO:__main__:  3: Item/Part 1B - Unresolved Staff Comments...
INFO:__main__:  4: Item/Part 3 - Legal Proceedings...
INFO:__main__:  5: Item/Part 4 - Mine Safety Disclosures...
INFO:__main__:  6: Item/Part 6 - Selected Financial Data...
INFO:__main__:  7: Item/Part 7 - Management’s Discussion and Analysis of Financial Condition ...
INFO:__main__:  8: Item/Part 7A - Quantitative and Qualitative Disclosures About Market Risk...
INFO:__main__:  9: Item/Part 8 - Financial Statements and Supplementary Data...
INFO:__main__:  10: Item/Part 9 - Changes in and Disagreements with Accountants on Accounting ...
INFO:__main__:  11: Item/Part 9B - Other Information...
INFO:__main__:  12: Item/Part 11 - Executive Compensation...
INFO:__main__:  13: Item/Part 12 - Security Ownership of Certain Beneficial Owners and Manageme...
INFO:__main__:  14: Item/Part 13 - Certain Relationships and Related T


📊 Comparison Results:

  Old detection: 19 sections

  Universal detection: 17 sections

  Improvement: +-2 sections


📋 Old Sections:

  1. Part I

  2. Item 1A

  3. Item 1B

  4. Item 3

  5. Item 4

  6. Item 6

  7. Item 7

  8. Item 7A

  9. Item 8

  10. Notes to Consolidated Financial Statements

  11. Opinion on the Financial Statements

  12. Item 9

  13. Item 9B

  14. Item 11

  15. Item 12

  16. Item 13

  17. Item 14

  18. Part IV

  19. Item 16


📋 Universal Sections:

  1. Item 1.    Business

  2. Risk Factors

  3. Unresolved Staff Comments

  4. Legal Proceedings

  5. Mine Safety Disclosures

  6. Selected Financial Data

  7. Management’s Discussion and Analysis of Financial Condition and Results of Operations

  8. Quantitative and Qualitative Disclosures About Market Risk

  9. Financial Statements and Supplementary Data

  10. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure

  11. Other Information

  12. Executive Compen