# In [None]:
# Azure Document Intelligence + Structure-Aware Chunker Test
# Complete Colab notebook for testing document processing and chunking

import os
import json
import tempfile
from typing import List, Dict, Any, Optional, Union, Tuple
from pathlib import Path
from dataclasses import dataclass, asdict
import re
import pandas as pd
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import time

# Install required packages
def install_requirements(packages: Optional[List[str]] = None):
    """Install any notebook dependencies that are not installed yet.

    Args:
        packages: Distribution names (as published on PyPI) to check and,
            if missing, install. ``None`` selects the notebook's default
            dependency list.

    Presence is checked against installed *distribution* metadata rather
    than by importing a module: several distributions expose an import name
    that differs from their PyPI name (``azure-ai-formrecognizer`` is
    imported as ``azure.ai.formrecognizer``), so an import probe built from
    the package name would report them as missing on every run.
    """
    # Local imports keep this helper self-contained.
    import subprocess
    import sys
    from importlib import metadata

    if packages is None:
        packages = [
            "langchain",
            "langchain-community",
            "langchain-openai",
            "langchain-experimental",
            "langchain-text-splitters",
            "azure-ai-formrecognizer",
            "azure-core",
            "pymupdf4llm",
            "pandas",
            "ipywidgets",
            "matplotlib",
            "seaborn",
            "wordcloud",
            "plotly",
        ]

    for package in packages:
        try:
            metadata.version(package)
        except metadata.PackageNotFoundError:
            print(f"Installing {package}...")
            # Run pip through the current interpreter so the package lands
            # in the environment this code is actually running in.
            result = subprocess.run(
                [sys.executable, "-m", "pip", "install", package],
                check=False,
            )
            if result.returncode != 0:
                print(f"Failed to install {package} (pip exit code {result.returncode})")
        else:
            print(f"‚úì {package} already installed")

# Uncomment the line below to install packages
# install_requirements()

# Azure Document Processor Class
class AzureDocumentProcessor:
    """Wrapper around LangChain's ``AzureAIDocumentIntelligenceLoader``.

    Documents are returned as plain dictionaries with ``page_content`` and
    ``metadata`` keys. Loader failures are reported with ``print`` and
    yield an empty list instead of raising, so notebook cells keep running.
    """

    def __init__(
        self,
        api_endpoint: Optional[str] = None,
        api_key: Optional[str] = None,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        analysis_features: Optional[List[str]] = None
    ):
        """Store the connection settings.

        Args:
            api_endpoint: Azure Document Intelligence endpoint URL.
            api_key: API key for the endpoint.
            api_model: Model identifier forwarded to the loader.
            mode: Loader output mode forwarded to the loader.
            analysis_features: Optional feature names forwarded to the
                loader; ``None`` becomes an empty list.

        Raises:
            ValueError: If the endpoint or the key is missing or empty.
        """
        self.api_endpoint = api_endpoint
        self.api_key = api_key
        self.api_model = api_model
        self.mode = mode
        self.analysis_features = analysis_features or []

        if not self.api_endpoint or not self.api_key:
            raise ValueError("Azure Document Intelligence endpoint and key are required")

    def _load_documents(self, source_kwargs: Dict[str, str], error_label: str) -> List[Dict]:
        """Run the Azure loader for one source and convert the result.

        Args:
            source_kwargs: Either ``{"file_path": ...}`` or
                ``{"url_path": ...}``, passed through to the loader.
            error_label: Noun used in the failure message ("document"/"URL").

        Returns:
            One dict per loaded document, or ``[]`` if anything failed.
        """
        try:
            # Imported lazily so the class can be defined without LangChain.
            from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

            loader = AzureAIDocumentIntelligenceLoader(
                api_endpoint=self.api_endpoint,
                api_key=self.api_key,
                api_model=self.api_model,
                mode=self.mode,
                analysis_features=self.analysis_features,
                **source_kwargs
            )

            docs = loader.load()
            return [{"page_content": doc.page_content, "metadata": doc.metadata} for doc in docs]

        except ImportError as e:
            print(f"Error: Required Azure packages not installed: {e}")
            return []
        except Exception as e:
            # Deliberate best-effort boundary: report and return nothing.
            print(f"Error processing {error_label}: {e}")
            return []

    def process_file(self, file_path: str) -> List[Dict]:
        """Process a local file and return its documents.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        return self._load_documents({"file_path": file_path}, "document")

    def process_url(self, url: str) -> List[Dict]:
        """Process a document reachable at ``url`` and return its documents."""
        return self._load_documents({"url_path": url}, "URL")

    def process_bytes(self, file_content: bytes, filename: str = "document") -> List[Dict]:
        """Process in-memory file content via a temporary file.

        Args:
            file_content: Raw bytes of the document.
            filename: Original file name; only its final path component is
                used, as the temp-file suffix, so the extension is kept.

        Returns:
            The documents produced by ``process_file``.
        """
        # A name such as "dir/scan.pdf" used verbatim as a suffix would point
        # into a non-existent sub-directory of the temp dir; keep the leaf.
        safe_name = Path(filename.replace("\\", "/")).name or "document"

        temp_file_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{safe_name}") as temp_file:
                temp_file_path = temp_file.name
                temp_file.write(file_content)
            return self.process_file(temp_file_path)
        finally:
            # Also covers a failed write, so no temp file is left behind.
            if temp_file_path is not None and os.path.exists(temp_file_path):
                os.unlink(temp_file_path)

    def extract_text_and_metadata(self, documents: List[Dict]) -> Dict[str, Any]:
        """Combine document dicts into one text blob plus merged metadata.

        Returns:
            A dict with ``text`` (page contents joined by blank lines),
            ``metadata`` (JSON-serialisable entries only; later documents
            overwrite earlier ones on key clashes) and ``page_count``
            (number of documents in "page" mode, otherwise 1; 0 when
            ``documents`` is empty).
        """
        if not documents:
            return {"text": "", "metadata": {}, "page_count": 0}

        combined_text = "\n\n".join(doc["page_content"] for doc in documents)
        combined_metadata = {}

        for doc in documents:
            for key, value in doc.get("metadata", {}).items():
                try:
                    # Probe serialisability; unserialisable values are dropped.
                    json.dumps(value)
                except (TypeError, ValueError):
                    continue
                combined_metadata[key] = value

        return {
            "text": combined_text,
            "metadata": combined_metadata,
            "page_count": len(documents) if self.mode == "page" else 1
        }

    def get_supported_formats(self) -> List[str]:
        """Get the list of supported file extensions (lower case, no dot)."""
        return [
            "pdf", "jpeg", "jpg", "png", "bmp", "tiff", "heif",
            "docx", "xlsx", "pptx", "html"
        ]

    def is_supported_format(self, file_path: str) -> bool:
        """Check, case-insensitively, whether the file extension is supported."""
        extension = Path(file_path).suffix.lower().lstrip('.')
        return extension in self.get_supported_formats()

# Structure-Aware Chunker Classes
@dataclass
class ChunkMetadata:
    """Enhanced metadata for insurance document chunks.

    Plain dataclass without defaults: every field must be supplied when an
    instance is created, either by keyword or positionally in the order
    declared below. The per-field notes reflect the field names and the
    original inline examples; the code that builds these records decides
    the exact values.
    """
    chunk_id: str  # identifier of the chunk
    chunk_type: str  # e.g. 'header_section', 'table', 'semantic', 'mixed' (not an exhaustive list)
    importance_score: float  # relevance weight of the chunk; scale is defined by the producer
    section_hierarchy: List[str]  # e.g. ['3.', '3.2', '3.2.1']
    has_tables: bool  # table flag for the chunk
    table_count: int  # number of tables associated with the chunk
    has_monetary_values: bool  # presumably True when monetary_amounts is non-empty -- confirm with producer
    monetary_amounts: List[str]  # monetary expressions found in the chunk text
    has_exclusions: bool  # presumably True when exclusion_phrases is non-empty -- confirm with producer
    exclusion_phrases: List[str]  # exclusion/limitation wording found in the chunk text
    policy_terms: List[str]  # insurance policy terms found in the chunk text
    cross_references: List[str]  # references to other sections/clauses found in the chunk text
    original_headers: List[str]  # header strings associated with the chunk
    chunk_position: int  # ordinal position of the chunk
    source_lines: Tuple[int, int]  # (start_line, end_line)

class MockEmbeddings:
    """Stand-in embeddings provider for tests that must not call OpenAI.

    Every vector has 100 components, each equal to 0.1, regardless of the
    input text.
    """

    def embed_documents(self, texts):
        """Return one dummy 100-dimensional vector per entry of ``texts``."""
        vectors = []
        for _ in texts:
            vectors.append([0.1 for _ in range(100)])
        return vectors

    def embed_query(self, text):
        """Return a single dummy 100-dimensional vector; ``text`` is ignored."""
        return [0.1 for _ in range(100)]

class StructureAwareChunker:
    """
    Simplified version of the insurance document chunker for testing.

    ``chunk_with_metadata`` cleans HTML/OCR residue, lifts tables out of the
    text (leaving placeholders behind), splits the remaining text by
    markdown headers when LangChain's splitters are importable (with simpler
    fallbacks otherwise) and attaches a ``ChunkMetadata`` record to every
    chunk that passes the minimum-size filter.
    """

    def __init__(
        self,
        chunk_size: int = 1500,
        chunk_overlap: int = 200,
        min_chunk_size: int = 300,
        use_mock_embeddings: bool = True
    ):
        """Configure sizes, compile the regexes and build the splitters.

        Args:
            chunk_size: Target maximum characters per chunk for the
                recursive and basic fallback splitters.
            chunk_overlap: Overlap passed to the recursive splitter.
            min_chunk_size: Non-table chunks shorter than this (after
                stripping) are dropped from the final result.
            use_mock_embeddings: When true, ``self.embeddings`` is a
                ``MockEmbeddings`` instance; otherwise ``None``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size

        # Use mock embeddings for testing
        self.embeddings = MockEmbeddings() if use_mock_embeddings else None

        # Compile patterns
        self._compile_insurance_patterns()

        # Setup splitters
        self._setup_splitters()

    def _compile_insurance_patterns(self):
        """Compile patterns specific to insurance documents"""

        # One markdown table row ("| cell | cell |") confined to a single
        # line. Cells may not contain newlines, so a match can never run
        # from one table through the following prose into the next table.
        md_row = r'[ \t]*\|[^|\n]+\|(?:[^|\n]+\|)+'

        # Table patterns
        self.table_patterns = [
            re.compile(r'<table[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE),
            # Markdown tables: one or more consecutive row lines
            re.compile(r'^' + md_row + r'(?:[ \t]*\n' + md_row + r')*', re.MULTILINE),
            re.compile(r'\+[-=]+\+.*?\+[-=]+\+', re.DOTALL),   # ASCII tables
        ]

        # Monetary patterns
        self.monetary_patterns = [
            re.compile(r'‚Çπ\s*[\d,]+(?:\.\d{2})?(?:\s*(?:lakh|crore|thousand)s?)?'),
            re.compile(r'Rs\.?\s*[\d,]+(?:\.\d{2})?(?:\s*(?:lakh|crore|thousand)s?)?'),
            re.compile(r'INR\s*[\d,]+(?:\.\d{2})?'),
            re.compile(r'[\d,]+\s*(?:lakh|crore|thousand)s?', re.IGNORECASE),
        ]

        # Exclusion patterns
        self.exclusion_patterns = [
            re.compile(r'\b(?:not covered|excluded|limitation|restriction)\b', re.IGNORECASE),
            re.compile(r'\b(?:provided that|except|however|subject to|notwithstanding)\b', re.IGNORECASE),
        ]

        # Policy terms patterns
        self.policy_terms_patterns = [
            re.compile(r'\b(?:sum insured|premium|deductible|copay|coverage)\b', re.IGNORECASE),
            re.compile(r'\b(?:proportionate deduction|room rent|icu charges|daycare)\b', re.IGNORECASE),
        ]

        # Cross-reference patterns
        self.cross_ref_patterns = [
            re.compile(r'(?:section|clause)\s*[\d.]+(?:\.\d+)*', re.IGNORECASE),
        ]

    def _setup_splitters(self):
        """Create the header and recursive splitters, or set both to None.

        Both classes are taken from ``langchain_text_splitters`` when that
        package is available, falling back to the legacy
        ``langchain.text_splitter`` module. If neither can be imported a
        warning is printed and the chunker relies on its built-in
        paragraph splitting.
        """
        try:
            try:
                from langchain_text_splitters import (
                    MarkdownHeaderTextSplitter,
                    RecursiveCharacterTextSplitter,
                )
            except ImportError:
                from langchain.text_splitter import (
                    MarkdownHeaderTextSplitter,
                    RecursiveCharacterTextSplitter,
                )

            # Header-based splitter
            headers_to_split_on = [
                ("#", "Header 1"),
                ("##", "Header 2"),
                ("###", "Header 3"),
                ("####", "Header 4"),
            ]

            self.header_splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=headers_to_split_on,
                strip_headers=False
            )

            # Recursive splitter as fallback
            self.recursive_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
                length_function=len,
                separators=["\n\n", "\n", ". ", " ", ""]
            )

        except ImportError as e:
            print(f"Warning: LangChain not available: {e}")
            self.header_splitter = None
            self.recursive_splitter = None

    def chunk_with_metadata(self, text: str) -> List[Tuple[str, "ChunkMetadata"]]:
        """
        Advanced chunking with metadata.

        Args:
            text: Raw document text (markdown and/or HTML fragments).

        Returns:
            ``(chunk_text, ChunkMetadata)`` pairs: text chunks first, in
            splitter order, followed by one chunk per extracted table.
        """

        # Step 1: Preprocess text
        preprocessed_text = self._preprocess_text(text)

        # Step 2: Extract tables
        tables, text_without_tables = self._extract_tables(preprocessed_text)

        # Step 3: Split by headers or fallback
        if self.header_splitter:
            chunks = self._split_by_headers(text_without_tables)
        else:
            chunks = self._simple_split(text_without_tables)

        # Step 4: Process tables
        table_chunks = self._process_tables(tables)

        # Step 5: Combine and finalize
        all_chunks = chunks + table_chunks
        return self._create_final_chunks(all_chunks)

    def _preprocess_text(self, text: str) -> str:
        """Clean up text from Azure OCR.

        Converts inline emphasis tags to markdown, removes paragraph/div
        tags, turns ``<br>`` into newlines, normalises whitespace and
        promotes upper-case numbered section titles to markdown headers.
        """

        # Convert HTML tags to markdown. The \b after each tag name keeps a
        # rule from firing on longer tag names that merely share a prefix
        # (<b> vs <br>/<body>, <i> vs <img>, <em> vs <embed>).
        text = re.sub(r'<strong\b[^>]*>(.*?)</strong>', r'**\1**', text, flags=re.IGNORECASE)
        text = re.sub(r'<b\b[^>]*>(.*?)</b>', r'**\1**', text, flags=re.IGNORECASE)
        text = re.sub(r'<em\b[^>]*>(.*?)</em>', r'*\1*', text, flags=re.IGNORECASE)
        text = re.sub(r'<i\b[^>]*>(.*?)</i>', r'*\1*', text, flags=re.IGNORECASE)

        # Clean up common OCR artifacts (case-insensitive, like the rules
        # above; \b stops <p> from matching <pre>, <param>, ...).
        text = re.sub(r'</?p\b[^>]*>', '', text, flags=re.IGNORECASE)
        text = re.sub(r'</?div\b[^>]*>', '', text, flags=re.IGNORECASE)
        text = re.sub(r'<br\b[^>]*/?>', '\n', text, flags=re.IGNORECASE)

        # Normalize whitespace
        text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)

        # Convert numbered sections to headers
        text = re.sub(r'^(\d+\.\s+[A-Z\s]+):?\s*$', r'## \1', text, flags=re.MULTILINE)
        text = re.sub(r'^(\d+\.\d+\s+[A-Z\s]+):?\s*$', r'### \1', text, flags=re.MULTILINE)

        return text

    def _extract_tables(self, text: str) -> Tuple[List[Dict], str]:
        """Extract tables and replace them with placeholders.

        Patterns are applied one after another to the progressively
        rewritten text, so ``start_pos``/``end_pos`` refer to the text as
        it was when that pattern ran, not to the original input.

        Returns:
            ``(tables, text_without_tables)`` where each table is a dict
            with ``id``, ``text``, ``start_pos``, ``end_pos`` and
            ``importance_score``.
        """
        tables = []
        text_without_tables = text

        for pattern in self.table_patterns:
            matches = list(pattern.finditer(text_without_tables))

            # Replace back-to-front so earlier match offsets stay valid.
            for match in reversed(matches):
                table_text = match.group(0)
                start_pos, end_pos = match.span()

                table_info = {
                    'id': f'table_{len(tables)}',
                    'text': table_text,
                    'start_pos': start_pos,
                    'end_pos': end_pos,
                    'importance_score': self._calculate_table_importance(table_text),
                }
                tables.append(table_info)

                # Replace with placeholder
                placeholder = f"\n\n[TABLE_{table_info['id']}_PLACEHOLDER]\n\n"
                text_without_tables = (
                    text_without_tables[:start_pos]
                    + placeholder
                    + text_without_tables[end_pos:]
                )

        return tables, text_without_tables

    @staticmethod
    def _count_matches(patterns, text: str) -> int:
        """Total number of matches of all ``patterns`` in ``text``."""
        return sum(len(p.findall(text)) for p in patterns)

    @staticmethod
    def _find_matches(patterns, text: str) -> List[str]:
        """All matches of all ``patterns`` in ``text``, grouped by pattern."""
        found = []
        for pattern in patterns:
            found.extend(pattern.findall(text))
        return found

    def _calculate_table_importance(self, table_text: str) -> float:
        """Calculate importance score for tables.

        Base 5.0, plus 3.0 per monetary match and 2.0 per policy-term match.
        """
        score = 5.0  # Base score for being a table
        score += self._count_matches(self.monetary_patterns, table_text) * 3.0
        score += self._count_matches(self.policy_terms_patterns, table_text) * 2.0
        return score

    def _make_text_chunk(self, text: str, chunk_type: str, position: int) -> Dict:
        """Build the intermediate chunk dict for a header-less text chunk."""
        return {
            'text': text,
            'metadata': {},
            'chunk_type': chunk_type,
            'position': position,
            'importance_score': self._calculate_section_importance(text),
            'headers': []
        }

    def _split_by_headers(self, text: str) -> List[Dict]:
        """Split text using the header splitter.

        Any failure inside the splitter is reported and handled by falling
        back to ``_simple_split``.
        """

        try:
            header_splits = self.header_splitter.split_text(text)

            chunks = []
            for i, doc in enumerate(header_splits):
                chunks.append({
                    'text': doc.page_content,
                    'metadata': doc.metadata,
                    'chunk_type': 'header_section',
                    'position': i,
                    'importance_score': self._calculate_section_importance(doc.page_content),
                    'headers': list(doc.metadata.values()) if doc.metadata else []
                })

            return chunks

        except Exception as e:
            print(f"Header splitting failed: {e}. Using simple split.")
            return self._simple_split(text)

    def _simple_split(self, text: str) -> List[Dict]:
        """Simple fallback splitting.

        Uses the recursive splitter when available; if it is missing or
        fails, packs blank-line-separated paragraphs into chunks of at most
        ``chunk_size`` characters (a single oversized paragraph is kept
        whole).
        """

        if self.recursive_splitter:
            try:
                splits = self.recursive_splitter.split_text(text)
                return [
                    self._make_text_chunk(chunk_text, 'simple_split', i)
                    for i, chunk_text in enumerate(splits)
                ]
            except Exception as e:
                # Still best-effort, but no longer silent, and no longer
                # swallowing KeyboardInterrupt/SystemExit.
                print(f"Recursive splitting failed: {e}. Using basic split.")

        # Ultimate fallback - basic splitting
        chunks = []
        current_chunk = ""

        for para in text.split('\n\n'):
            if current_chunk and len(current_chunk) + len(para) > self.chunk_size:
                chunks.append(
                    self._make_text_chunk(current_chunk.strip(), 'basic_split', len(chunks))
                )
                current_chunk = para
            else:
                current_chunk = current_chunk + "\n\n" + para if current_chunk else para

        if current_chunk.strip():
            chunks.append(
                self._make_text_chunk(current_chunk.strip(), 'basic_split', len(chunks))
            )

        return chunks

    def _process_tables(self, tables: List[Dict]) -> List[Dict]:
        """Turn extracted table dicts into intermediate chunk dicts."""
        return [
            {
                'text': table_info['text'],
                'metadata': {'table_id': table_info['id']},
                'chunk_type': 'table',
                'position': f"table_{table_info['id']}",
                'importance_score': table_info['importance_score'],
                'headers': []
            }
            for table_info in tables
        ]

    def _calculate_section_importance(self, text: str) -> float:
        """Calculate importance score for text sections.

        Base 1.0, plus 2.0 per monetary match, 1.5 per policy-term match
        and 3.0 per exclusion match.
        """
        score = 1.0  # Base score
        score += self._count_matches(self.monetary_patterns, text) * 2.0
        score += self._count_matches(self.policy_terms_patterns, text) * 1.5
        score += self._count_matches(self.exclusion_patterns, text) * 3.0
        return score

    def _create_final_chunks(self, all_chunks: List[Dict]) -> List[Tuple[str, "ChunkMetadata"]]:
        """Create final chunks with metadata.

        Non-table chunks shorter than ``min_chunk_size`` are dropped.
        ``chunk_id``/``chunk_position`` use the index in ``all_chunks``, so
        ids of dropped chunks are simply skipped. ``source_lines`` is not
        tracked and is always ``(0, 0)``.
        """

        final_chunks = []

        for i, chunk_info in enumerate(all_chunks):
            chunk_text = chunk_info['text']
            is_table = chunk_info['chunk_type'] == 'table'

            # Skip tiny chunks unless they're tables
            if len(chunk_text.strip()) < self.min_chunk_size and not is_table:
                continue

            # Extract detailed metadata
            monetary_amounts = self._find_matches(self.monetary_patterns, chunk_text)
            exclusion_phrases = self._find_matches(self.exclusion_patterns, chunk_text)
            policy_terms = self._find_matches(self.policy_terms_patterns, chunk_text)
            cross_references = self._find_matches(self.cross_ref_patterns, chunk_text)

            # Text chunks reference tables through placeholders; a table
            # chunk *is* a table but contains no placeholder, so count it
            # explicitly instead of reporting "no tables" for it.
            table_count = chunk_text.count('[TABLE_') + (1 if is_table else 0)
            headers = chunk_info.get('headers', [])

            metadata = ChunkMetadata(
                chunk_id=f"chunk_{i}",
                chunk_type=chunk_info['chunk_type'],
                importance_score=chunk_info['importance_score'],
                section_hierarchy=headers,
                has_tables=table_count > 0,
                table_count=table_count,
                has_monetary_values=len(monetary_amounts) > 0,
                monetary_amounts=monetary_amounts[:10],
                has_exclusions=len(exclusion_phrases) > 0,
                exclusion_phrases=exclusion_phrases[:5],
                policy_terms=policy_terms[:10],
                cross_references=cross_references[:5],
                original_headers=headers,
                chunk_position=i,
                source_lines=(0, 0)
            )

            final_chunks.append((chunk_text, metadata))

        return final_chunks

# Test and Visualization Functions
class DocumentProcessorTester:
    """Test harness for the document processor and chunker"""

    def __init__(self):
        """Start with a default chunker, no Azure processor and empty results."""
        # Results of the most recent run; all empty until something is processed.
        self.processing_stats = {}
        self.last_chunks = []
        self.last_processed_text = ""

        # The chunker needs no credentials, so it is created right away.
        self.chunker = StructureAwareChunker()

        # Filled in later, once credentials have been entered.
        self.processor = None

    def setup_azure_credentials(self):
        """Show a small form for entering Azure credentials.

        Displays endpoint, key, model and mode inputs plus a button. When
        the button is clicked an ``AzureDocumentProcessor`` is built from
        the entered values and stored on ``self.processor``; the outcome
        (success details or the failure reason) is printed into the form's
        output area.
        """

        print("üîß Azure Document Intelligence Setup")
        print("=" * 50)

        def label_style():
            # Fresh dict per widget: let the description take its natural width.
            return {'description_width': 'initial'}

        def wide():
            return {'width': '500px'}

        # Input controls
        endpoint_widget = widgets.Text(
            placeholder="https://your-resource.cognitiveservices.azure.com/",
            description="Endpoint:",
            style=label_style(),
            layout=wide()
        )
        key_widget = widgets.Password(
            placeholder="Your API key",
            description="API Key:",
            style=label_style(),
            layout=wide()
        )
        model_widget = widgets.Dropdown(
            options=['prebuilt-layout', 'prebuilt-document', 'prebuilt-read'],
            value='prebuilt-layout',
            description="Model:",
            style=label_style()
        )
        mode_widget = widgets.Dropdown(
            options=['markdown', 'single', 'page'],
            value='markdown',
            description="Mode:",
            style=label_style()
        )
        setup_button = widgets.Button(
            description="Setup Processor",
            button_style='success',
            layout={'width': '150px'}
        )
        output_widget = widgets.Output()

        def handle_setup(_button):
            # Everything printed here lands in the form's output area.
            with output_widget:
                output_widget.clear_output()
                try:
                    endpoint = endpoint_widget.value.strip()
                    chosen_model = model_widget.value
                    chosen_mode = mode_widget.value

                    self.processor = AzureDocumentProcessor(
                        api_endpoint=endpoint,
                        api_key=key_widget.value.strip(),
                        api_model=chosen_model,
                        mode=chosen_mode
                    )
                    print("‚úÖ Azure Document Processor setup complete!")
                    print(f"   Endpoint: {endpoint}")
                    print(f"   Model: {chosen_model}")
                    print(f"   Mode: {chosen_mode}")
                except Exception as e:
                    print(f"‚ùå Setup failed: {e}")

        setup_button.on_click(handle_setup)

        # Lay the form out top to bottom and show it.
        form_rows = [
            widgets.HTML("<h3>Enter your Azure Document Intelligence credentials:</h3>"),
            endpoint_widget,
            key_widget,
            model_widget,
            mode_widget,
            setup_button,
            output_widget,
        ]
        display(widgets.VBox(form_rows))

    def upload_and_process_document(self):
        """Upload and process a document"""

        if not self.processor:
            print("‚ùå Please setup Azure credentials first!")
            return

        print("\nüìÅ Upload Document")
        print("=" * 30)

        # File upload
        upload_button = widgets.FileUpload(
            accept='.pdf,.docx,.jpg,.jpeg,.png',
            multiple=False,
            description="Choose File"
        )

        process_button = widgets.Button(
            description="Process Document",
            button_style='primary',
            layout={'width': '150px'}
        )

        output_widget = widgets.Output()

        def on_process_click(b):
            with output_widget:
                output_widget.clear_output()

                if not upload_button.value:
                    print("‚ùå Please select a file first!")
                    return

                try:
                    # Get uploaded file
                    file_info = list(upload_button.value.values())[0]
                    filename = file_info['metadata']['name']
                    file_content = file_info['content']

                    print(f"üìÑ Processing: {filename}")
                    print("   This may take a few moments...")

                    start_time = time.time()

                    # Process with Azure
                    documents = self.processor.process_bytes(file_content, filename)

                    if documents:
                        # Extract text
                        result = self.processor.extract_text_and_metadata(documents)
                        self.last_processed_text = result['text']

                        processing_time = time.time() - start_time

                        print(f"‚úÖ Document processed successfully!")
                        print(f"   Pages: {result['page_count']}")
                        print(f"   Total characters: {len(self.last_processed_text):,}")
                        print(f"   Processing time: {processing_time:.2f} seconds")

                        # Store stats
                        self.processing_stats = {
                            'filename': filename,
                            'pages': result['page_count'],
                            'characters': len(self.last_processed_text),
                            'processing_time': processing_time,
                            'metadata': result['metadata']
                        }

                        # Show preview
                        preview = self.last_processed_text[:1000]
                        print(f"\nüìù Text Preview (first 1000 chars):")
                        print("-" * 50)
                        print(preview)
                        if len(self.last_processed_text) > 1000:
                            print("...")

                        # Auto-chunk the document
                        print("\nüîß Auto-chunking document...")
                        self.chunk_current_text()

                    else:
                        print("‚ùå No content extracted from document")

                except Exception as e:
                    print(f"‚ùå Processing failed: {e}")

        process_button.on_click(on_process_click)

        # Display widgets
        display(widgets.VBox([
            upload_button,
            process_button,
            output_widget
        ]))

    def test_with_sample_text(self):
        """Test with sample insurance document text"""

        sample_text = """
# HEALTH INSURANCE POLICY DOCUMENT

## 1. POLICY OVERVIEW

This comprehensive health insurance policy provides coverage for medical expenses up to ‚Çπ5,00,000 per policy year.

### 1.1 Sum Insured
The sum insured under this policy is ‚Çπ5,00,000 (Rupees Five Lakhs Only) per policy year.

### 1.2 Premium
Annual premium: ‚Çπ12,500 (including GST)

## 2. BENEFITS COVERED

### 2.1 Hospitalization Benefits

<table>
<tr><th>Benefit Type</th><th>Coverage Limit</th><th>Conditions</th></tr>
<tr><td>Room & Boarding</td><td>‚Çπ3,000 per day</td><td>Maximum 365 days</td></tr>
<tr><td>ICU Charges</td><td>‚Çπ6,000 per day</td><td>Maximum 30 days</td></tr>
<tr><td>Surgeon Fees</td><td>Up to Sum Insured</td><td>As per policy terms</td></tr>
</table>

### 2.2 Pre & Post Hospitalization

- **Pre-hospitalization**: Up to 30 days, maximum ‚Çπ25,000
- **Post-hospitalization**: Up to 60 days, maximum ‚Çπ50,000

### 2.3 Proportionate Deduction Clause

**Important**: If room rent exceeds ‚Çπ3,000 per day, proportionate deduction will apply to:
- Surgeon fees
- OT charges
- Medicine costs
- Consumables

The deduction percentage will be calculated as: (Room Rent Claimed - Room Rent Limit) / Room Rent Claimed √ó 100

## 3. EXCLUSIONS

### 3.1 Permanent Exclusions

The following are **not covered** under this policy:

- Pre-existing diseases for first 2 years
- Cosmetic surgery (except post-accident)
- Dental treatment (except due to accident)
- Alternative treatments (Ayush)

### 3.2 Waiting Periods

| Condition | Waiting Period |
|-----------|----------------|
| Pre-existing diseases | 24 months |
| Specific diseases | 12 months |
| Maternity benefits | 36 months |

## 4. CLAIM PROCEDURES

### 4.1 Cashless Claims

For cashless treatment, contact our 24x7 helpline: **1800-XXX-XXXX**

### 4.2 Reimbursement Claims

Submit the following documents within 30 days:
1. Claim form (duly filled)
2. Original bills and receipts
3. Discharge summary
4. Investigation reports

**Maximum claim processing time**: 30 days from receipt of all documents.

## 5. SPECIAL FEATURES

### 5.1 No Claim Bonus

Earn 5% bonus on sum insured for each claim-free year, up to maximum 50%.

### 5.2 Automatic Restoration

Sum insured gets automatically restored if exhausted due to any one illness.

## 6. CONTACT INFORMATION

- **Customer Care**: 1800-XXX-XXXX
- **Email**: support@insurance.com
- **Website**: www.insurance.com

---

*This is a simplified version for testing purposes. Please refer to the complete policy document for detailed terms and conditions.*
"""

        print("üß™ Testing with Sample Insurance Document")
        print("=" * 50)

        self.last_processed_text = sample_text

        print(f"üìÑ Sample document loaded ({len(sample_text):,} characters)")
        print("\nüìù Document Preview:")
        print("-" * 30)
        print(sample_text[:500] + "...")

        # Chunk the sample text
        print("\nüîß Chunking document...")
        self.chunk_current_text()

    def chunk_current_text(self):
        """Chunk the current processed text"""

        if not self.last_processed_text:
            print("‚ùå No text to chunk! Process a document first.")
            return

        try:
            start_time = time.time()

            # Perform chunking
            self.last_chunks = self.chunker.chunk_with_metadata(self.last_processed_text)

            chunking_time = time.time() - start_time

            print(f"‚úÖ Text chunked successfully!")
            print(f"   Total chunks: {len(self.last_chunks)}")
            print(f"   Chunking time: {chunking_time:.2f} seconds")

            # Show chunk statistics
            self.show_chunk_statistics()

        except Exception as e:
            print(f"‚ùå Chunking failed: {e}")

    def show_chunk_statistics(self):
        """Display statistics about the chunks"""

        if not self.last_chunks:
            print("‚ùå No chunks available!")
            return

        print("\nüìä Chunk Statistics")
        print("=" * 30)

        # Basic stats
        chunk_sizes = [len(chunk[0]) for chunk in self.last_chunks]
        chunk_types = [chunk[1].chunk_type for chunk in self.last_chunks]
        importance_scores = [chunk[1].importance_score for chunk in self.last_chunks]

        print(f"Total chunks: {len(self.last_chunks)}")
        print(f"Average chunk size: {sum(chunk_sizes) / len(chunk_sizes):.0f} characters")
        print(f"Min chunk size: {min(chunk_sizes)} characters")
        print(f"Max chunk size: {max(chunk_sizes)} characters")
        print(f"Average importance score: {sum(importance_scores) / len(importance_scores):.2f}")

        # Chunk types distribution
        type_counts = Counter(chunk_types)
        print(f"\nChunk types:")
        for chunk_type, count in type_counts.items():
            print(f"  {chunk_type}: {count}")

        # Special features
        tables_count = sum(1 for chunk in self.last_chunks if chunk[1].has_tables)
        monetary_count = sum(1 for chunk in self.last_chunks if chunk[1].has_monetary_values)
        exclusions_count = sum(1 for chunk in self.last_chunks if chunk[1].has_exclusions)

        print(f"\nSpecial features:")
        print(f"  Chunks with tables: {tables_count}")
        print(f"  Chunks with monetary values: {monetary_count}")
        print(f"  Chunks with exclusions: {exclusions_count}")

    def visualize_chunks(self):
        """Plot a six-panel overview of the current chunks and show a summary table.

        Reads ``self.last_chunks`` (pairs of chunk text and metadata). When it
        is empty, a message is printed and nothing is drawn. Otherwise a 2x3
        matplotlib figure is rendered (size histogram, type pie chart,
        importance by position, feature heatmap, stacked content-density bars,
        size-vs-importance scatter), followed by a table of the first ten
        chunks.
        """

        if not self.last_chunks:
            print("‚ùå No chunks to visualize! Process and chunk a document first.")
            return

        print("üìä Creating Chunk Visualizations")
        print("=" * 40)

        # One record per chunk; these keys become the DataFrame columns.
        records = [
            {
                'chunk_id': meta.chunk_id,
                'chunk_type': meta.chunk_type,
                'size': len(body),
                'importance_score': meta.importance_score,
                'has_tables': meta.has_tables,
                'has_monetary_values': meta.has_monetary_values,
                'has_exclusions': meta.has_exclusions,
                'table_count': meta.table_count,
                'monetary_count': len(meta.monetary_amounts),
                'exclusion_count': len(meta.exclusion_phrases),
                'policy_terms_count': len(meta.policy_terms)
            }
            for body, meta in self.last_chunks
        ]
        frame = pd.DataFrame(records)
        positions = range(len(frame))

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Document Chunk Analysis', fontsize=16)
        (size_ax, types_ax, score_ax), (flags_ax, density_ax, relation_ax) = axes

        # Panel 1: how chunk lengths are distributed.
        size_ax.hist(frame['size'], bins=20, alpha=0.7, color='skyblue')
        size_ax.set_title('Chunk Size Distribution')
        size_ax.set_xlabel('Characters')
        size_ax.set_ylabel('Frequency')

        # Panel 2: share of each chunk type.
        per_type = frame['chunk_type'].value_counts()
        types_ax.pie(per_type.values, labels=per_type.index, autopct='%1.1f%%')
        types_ax.set_title('Chunk Types Distribution')

        # Panel 3: importance score along the document, coloured by score.
        score_ax.scatter(
            positions,
            frame['importance_score'],
            alpha=0.7,
            c=frame['importance_score'],
            cmap='viridis'
        )
        score_ax.set_title('Importance Scores by Position')
        score_ax.set_xlabel('Chunk Position')
        score_ax.set_ylabel('Importance Score')

        # Panel 4: boolean feature flags as a 0/1 heatmap (features x chunks).
        flag_columns = ['has_tables', 'has_monetary_values', 'has_exclusions']
        flag_matrix = frame[flag_columns].astype(int)
        sns.heatmap(flag_matrix.T, ax=flags_ax, cmap='YlOrRd', cbar=True)
        flags_ax.set_title('Special Features by Chunk')
        flags_ax.set_xlabel('Chunk Index')

        # Panel 5: policy-term counts stacked on top of monetary-value counts.
        density_ax.bar(positions, frame['monetary_count'], alpha=0.7, label='Monetary Values')
        density_ax.bar(
            positions,
            frame['policy_terms_count'],
            alpha=0.7,
            label='Policy Terms',
            bottom=frame['monetary_count']
        )
        density_ax.set_title('Content Density')
        density_ax.set_xlabel('Chunk Index')
        density_ax.set_ylabel('Count')
        density_ax.legend()

        # Panel 6: chunk size against importance, coloured by row index.
        relation_ax.scatter(
            frame['size'],
            frame['importance_score'],
            alpha=0.7,
            c=frame.index,
            cmap='plasma'
        )
        relation_ax.set_title('Size vs Importance Score')
        relation_ax.set_xlabel('Chunk Size (characters)')
        relation_ax.set_ylabel('Importance Score')

        plt.tight_layout()
        plt.show()

        # Tabular recap limited to the first ten chunks.
        print("\nüìã Chunk Summary Table")
        print("=" * 50)

        summary_columns = [
            'chunk_id', 'chunk_type', 'size', 'importance_score',
            'has_tables', 'has_monetary_values', 'has_exclusions'
        ]
        display(frame[summary_columns].head(10))

    def interactive_chunk_explorer(self):
        """Create an interactive widget to explore chunks.

        Reads ``self.last_chunks`` (pairs of chunk text and metadata). When it
        is empty, a message is printed and no widgets are created. Otherwise a
        dropdown with one entry per chunk and three filter checkboxes are
        displayed; the details of the selected chunk are rendered into an
        output area and refreshed whenever the selection or a filter changes.

        A ticked filter hides the selected chunk if the chunk lacks that
        feature. In that case a short notice naming the unmet filter(s) is
        shown instead of leaving the output area blank.
        """

        if not self.last_chunks:
            print("‚ùå No chunks to explore! Process and chunk a document first.")
            return

        print("üîç Interactive Chunk Explorer")
        print("=" * 40)

        # Dropdown entries are (label, index into self.last_chunks).
        chunk_options = [
            (f"Chunk {i}: {meta.chunk_type} (Score: {meta.importance_score:.1f})", i)
            for i, (_, meta) in enumerate(self.last_chunks)
        ]

        chunk_selector = widgets.Dropdown(
            options=chunk_options,
            description="Select Chunk:",
            style={'description_width': 'initial'},
            layout={'width': '400px'}
        )

        # Filter options
        filter_tables = widgets.Checkbox(value=False, description="Has Tables")
        filter_monetary = widgets.Checkbox(value=False, description="Has Monetary Values")
        filter_exclusions = widgets.Checkbox(value=False, description="Has Exclusions")

        # Content display
        output_widget = widgets.Output()

        def update_chunk_display(change=None):
            """Re-render the output area; ``change`` is the unused observe payload."""
            with output_widget:
                output_widget.clear_output()

                chunk_idx = chunk_selector.value
                if chunk_idx is None:
                    return

                text, metadata = self.last_chunks[chunk_idx]

                # Ticked filters that this chunk does not satisfy. Tell the
                # user why nothing is shown rather than rendering a blank area.
                unmet = [
                    box.description
                    for box, matches in (
                        (filter_tables, metadata.has_tables),
                        (filter_monetary, metadata.has_monetary_values),
                        (filter_exclusions, metadata.has_exclusions),
                    )
                    if box.value and not matches
                ]
                if unmet:
                    print(f"Chunk {chunk_idx} does not match the active filter(s): {', '.join(unmet)}")
                    print("Untick the filter or select another chunk.")
                    return

                # Display chunk info
                print(f"üîç Chunk {chunk_idx}: {metadata.chunk_id}")
                print("=" * 50)
                print(f"Type: {metadata.chunk_type}")
                print(f"Size: {len(text)} characters")
                print(f"Importance Score: {metadata.importance_score:.2f}")
                print(f"Position: {metadata.chunk_position}")

                # Special features
                features = []
                if metadata.has_tables:
                    features.append(f"üìä {metadata.table_count} table(s)")
                if metadata.has_monetary_values:
                    features.append(f"üí∞ {len(metadata.monetary_amounts)} monetary value(s)")
                if metadata.has_exclusions:
                    features.append(f"‚ö†Ô∏è {len(metadata.exclusion_phrases)} exclusion(s)")

                if features:
                    print(f"Features: {', '.join(features)}")

                # Headers
                if metadata.original_headers:
                    print(f"Headers: {', '.join(metadata.original_headers)}")

                # Extracted values are capped (5 / 5 / 3) to keep the view short.
                if metadata.monetary_amounts:
                    print(f"\nüí∞ Monetary Values: {', '.join(metadata.monetary_amounts[:5])}")

                if metadata.policy_terms:
                    print(f"\nüìã Policy Terms: {', '.join(metadata.policy_terms[:5])}")

                if metadata.exclusion_phrases:
                    print(f"\n‚ö†Ô∏è Exclusions: {', '.join(metadata.exclusion_phrases[:3])}")

                # Only the first 1000 characters of the text are shown.
                print(f"\nüìù Content:")
                print("-" * 50)
                print(text[:1000])
                if len(text) > 1000:
                    print("...")

        # Any change to the selection or a filter triggers a re-render.
        for control in (chunk_selector, filter_tables, filter_monetary, filter_exclusions):
            control.observe(update_chunk_display, names='value')

        # Display widgets
        filters_box = widgets.HBox([filter_tables, filter_monetary, filter_exclusions])
        display(widgets.VBox([
            chunk_selector,
            widgets.HTML("<b>Filters:</b>"),
            filters_box,
            output_widget
        ]))

        # Initial display
        update_chunk_display()

    def export_chunks(self):
        """Show export controls that save the current chunks and download the file.

        When ``self.last_chunks`` is empty, only a message is printed.
        Otherwise a format dropdown (JSON, CSV, Excel, plain text) and an
        Export button are displayed. Clicking the button writes
        ``chunks_export.<ext>`` in the working directory and passes it to
        ``files.download``. Any exception raised during the export is reported
        in the output area instead of propagating.
        """

        if not self.last_chunks:
            print("‚ùå No chunks to export! Process and chunk a document first.")
            return

        print("üíæ Export Chunks")
        print("=" * 20)

        format_selector = widgets.Dropdown(
            options=[
                ('JSON', 'json'),
                ('CSV', 'csv'),
                ('Excel', 'excel'),
                ('Plain Text', 'txt')
            ],
            value='json',
            description="Format:"
        )
        export_button = widgets.Button(description="Export", button_style='success')
        output_widget = widgets.Output()

        def flat_rows(preview_only):
            """Return one flat dict per chunk; text is cut to 500 chars when preview_only."""
            rows = []
            for text, metadata in self.last_chunks:
                if preview_only and len(text) > 500:
                    shown_text = text[:500] + '...'
                else:
                    shown_text = text
                rows.append({
                    'chunk_id': metadata.chunk_id,
                    'chunk_type': metadata.chunk_type,
                    'size': len(text),
                    'importance_score': metadata.importance_score,
                    'has_tables': metadata.has_tables,
                    'has_monetary_values': metadata.has_monetary_values,
                    'has_exclusions': metadata.has_exclusions,
                    'monetary_amounts': ', '.join(metadata.monetary_amounts),
                    'policy_terms': ', '.join(metadata.policy_terms),
                    'text': shown_text
                })
            return rows

        def save_json(path):
            """Write id, full text and the complete metadata of every chunk as JSON."""
            payload = [
                {
                    'chunk_id': metadata.chunk_id,
                    'text': text,
                    'metadata': asdict(metadata)
                }
                for text, metadata in self.last_chunks
            ]
            with open(path, 'w', encoding='utf-8') as handle:
                json.dump(payload, handle, indent=2, ensure_ascii=False)

        def save_text(path):
            """Write a human-readable dump with a short header before each chunk."""
            with open(path, 'w', encoding='utf-8') as handle:
                for position, (text, metadata) in enumerate(self.last_chunks):
                    handle.write(
                        f"=== Chunk {position}: {metadata.chunk_id} ===\n"
                        f"Type: {metadata.chunk_type}\n"
                        f"Importance Score: {metadata.importance_score}\n"
                        f"Size: {len(text)} characters\n"
                        "Content:\n"
                    )
                    handle.write(text)
                    handle.write("\n\n" + "="*50 + "\n\n")

        def on_export_click(b):
            with output_widget:
                output_widget.clear_output()

                try:
                    chosen = format_selector.value

                    if chosen == 'json':
                        filename = "chunks_export.json"
                        save_json(filename)
                    elif chosen == 'csv':
                        # CSV carries a 500-character preview of each chunk's text.
                        table = pd.DataFrame(flat_rows(True))
                        filename = "chunks_export.csv"
                        table.to_csv(filename, index=False, encoding='utf-8')
                    elif chosen == 'excel':
                        # Excel carries the full text of each chunk.
                        table = pd.DataFrame(flat_rows(False))
                        filename = "chunks_export.xlsx"
                        table.to_excel(filename, index=False)
                    elif chosen == 'txt':
                        filename = "chunks_export.txt"
                        save_text(filename)

                    print(f"‚úÖ Exported {len(self.last_chunks)} chunks to {filename}")

                    # Hand the written file to the browser download helper.
                    files.download(filename)

                except Exception as e:
                    print(f"‚ùå Export failed: {e}")

        export_button.on_click(on_export_click)

        display(widgets.HBox([format_selector, export_button]))
        display(output_widget)

    def configure_chunker(self):
        """Show sliders for the chunker settings plus Apply and Re-chunk buttons.

        The sliders start at the current ``chunk_size``, ``chunk_overlap`` and
        ``min_chunk_size`` of ``self.chunker``. "Apply Configuration" replaces
        ``self.chunker`` with a new ``StructureAwareChunker`` built from the
        slider values. "Re-chunk Current Text" calls
        ``self.chunk_current_text()`` when ``self.last_processed_text`` is
        non-empty, and prints a message otherwise.
        """

        print("‚öôÔ∏è Chunker Configuration")
        print("=" * 30)

        def slider(label, current, lowest, highest, increment):
            """Build an integer slider with a full-width description label."""
            return widgets.IntSlider(
                value=current,
                min=lowest,
                max=highest,
                step=increment,
                description=label,
                style={'description_width': 'initial'}
            )

        size_slider = slider("Chunk Size:", self.chunker.chunk_size, 500, 3000, 100)
        overlap_slider = slider("Overlap:", self.chunker.chunk_overlap, 50, 500, 25)
        floor_slider = slider("Min Size:", self.chunker.min_chunk_size, 100, 1000, 50)

        apply_button = widgets.Button(description="Apply Configuration", button_style='primary')
        rechunk_button = widgets.Button(description="Re-chunk Current Text", button_style='success')

        output_widget = widgets.Output()

        def on_apply_click(b):
            with output_widget:
                output_widget.clear_output()

                # Swap in a fresh chunker built from the three slider values.
                self.chunker = StructureAwareChunker(
                    chunk_size=size_slider.value,
                    chunk_overlap=overlap_slider.value,
                    min_chunk_size=floor_slider.value
                )

                print("‚úÖ Chunker configuration updated!")
                print(f"   Chunk Size: {size_slider.value}")
                print(f"   Overlap: {overlap_slider.value}")
                print(f"   Min Size: {floor_slider.value}")

        def on_rechunk_click(b):
            with output_widget:
                output_widget.clear_output()

                if self.last_processed_text:
                    print("üîß Re-chunking with new configuration...")
                    self.chunk_current_text()
                else:
                    print("‚ùå No text available to re-chunk!")

        apply_button.on_click(on_apply_click)
        rechunk_button.on_click(on_rechunk_click)

        # Sliders stacked vertically with the two buttons side by side below.
        display(widgets.VBox([
            size_slider,
            overlap_slider,
            floor_slider,
            widgets.HBox([apply_button, rechunk_button])
        ]))
        display(output_widget)

    def run_performance_test(self, test_configs: Optional[List[Dict[str, int]]] = None):
        """Run performance tests with different chunker configurations.

        Each configuration is passed as keyword arguments to
        ``StructureAwareChunker``, which is then run over
        ``self.last_processed_text`` while the elapsed time is measured. A
        results table is displayed, followed by four line plots (chunk count,
        processing time, average chunk size and average importance score, each
        against the configured chunk size).

        Args:
            test_configs: Configurations to benchmark. Every dict must contain
                ``chunk_size`` and ``chunk_overlap``, which are reported in
                the results table. When omitted, four built-in presets are
                used.

        Returns:
            None. Results are printed, displayed and plotted. Nothing is run
            when no text has been processed yet or when ``test_configs`` is
            empty.
        """

        if not self.last_processed_text:
            print("‚ùå No text available for performance testing!")
            return

        print("‚ö° Performance Testing")
        print("=" * 30)

        if test_configs is None:
            test_configs = [
                {'chunk_size': 1000, 'chunk_overlap': 100, 'min_chunk_size': 200},
                {'chunk_size': 1500, 'chunk_overlap': 200, 'min_chunk_size': 300},
                {'chunk_size': 2000, 'chunk_overlap': 300, 'min_chunk_size': 400},
                {'chunk_size': 2500, 'chunk_overlap': 400, 'min_chunk_size': 500},
            ]
        if not test_configs:
            print("No test configurations supplied.")
            return

        results = []
        total_runs = len(test_configs)

        print("Running performance tests...")

        for run_number, config in enumerate(test_configs, start=1):
            print(f"Test {run_number}/{total_runs}: Chunk size {config['chunk_size']}")

            # Create chunker with test config
            test_chunker = StructureAwareChunker(**config)

            # perf_counter is monotonic and high resolution, so short runs are
            # not distorted by wall-clock adjustments.
            start_time = time.perf_counter()
            chunks = test_chunker.chunk_with_metadata(self.last_processed_text)
            elapsed = time.perf_counter() - start_time

            # Each chunk is indexed as (text, metadata).
            text_lengths = [len(chunk[0]) for chunk in chunks]
            importance_scores = [chunk[1].importance_score for chunk in chunks]

            results.append({
                'config': config,
                'chunks_count': len(chunks),
                'processing_time': elapsed,
                'avg_chunk_size': sum(text_lengths) / len(text_lengths) if text_lengths else 0,
                'min_chunk_size': min(text_lengths) if text_lengths else 0,
                'max_chunk_size': max(text_lengths) if text_lengths else 0,
                'avg_importance': sum(importance_scores) / len(importance_scores) if importance_scores else 0,
                'chunks_with_tables': sum(1 for chunk in chunks if chunk[1].has_tables),
                'chunks_with_monetary': sum(1 for chunk in chunks if chunk[1].has_monetary_values),
            })

        # Display results
        print("\nüìä Performance Test Results")
        print("=" * 50)

        results_df = pd.DataFrame([{
            'Chunk Size': r['config']['chunk_size'],
            'Overlap': r['config']['chunk_overlap'],
            'Chunks': r['chunks_count'],
            'Time (s)': f"{r['processing_time']:.2f}",
            'Avg Size': f"{r['avg_chunk_size']:.0f}",
            'Avg Importance': f"{r['avg_importance']:.2f}",
            'Tables': r['chunks_with_tables'],
            'Monetary': r['chunks_with_monetary']
        } for r in results])

        display(results_df)

        # Plot each metric against the configured chunk size.
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))
        fig.suptitle('Performance Test Results', fontsize=14)

        configured_sizes = [r['config']['chunk_size'] for r in results]

        panels = [
            (axes[0, 0], 'chunks_count', 'bo-', 'Number of Chunks', 'Count'),
            (axes[0, 1], 'processing_time', 'ro-', 'Processing Time', 'Seconds'),
            (axes[1, 0], 'avg_chunk_size', 'go-', 'Average Chunk Size', 'Characters'),
            (axes[1, 1], 'avg_importance', 'mo-', 'Average Importance Score', 'Score'),
        ]
        for ax, metric, line_style, title, y_label in panels:
            ax.plot(configured_sizes, [r[metric] for r in results], line_style)
            ax.set_title(title)
            ax.set_xlabel('Chunk Size')
            ax.set_ylabel(y_label)

        plt.tight_layout()
        plt.show()

# Main Test Interface
class TestInterface:
    """Notebook menu exposing each DocumentProcessorTester action as a button."""

    def __init__(self):
        self.tester = DocumentProcessorTester()

    def show_main_menu(self):
        """Print the banner, render the grouped button grid and show the usage guide.

        Every button calls the same-named action on ``self.tester`` when
        clicked. The buttons are laid out in three columns: setup and
        document processing, analysis and visualization, configuration and
        export.
        """

        banner_rule = "=" * 70
        print("üöÄ Azure Document Intelligence + Structure-Aware Chunker Test Suite")
        print(banner_rule)
        print("This notebook allows you to test and experiment with:")
        print("‚Ä¢ Azure Document Intelligence for document processing")
        print("‚Ä¢ Structure-aware chunking for insurance documents")
        print("‚Ä¢ Visualization and analysis of chunking results")
        print(banner_rule)

        def menu_button(caption, style, action_name):
            """Create a fixed-width button that runs ``self.tester.<action_name>()``."""
            button = widgets.Button(
                description=caption,
                button_style=style,
                layout={'width': '250px', 'margin': '5px'}
            )
            # The tester attribute is looked up when the click happens, not now.
            button.on_click(lambda _clicked: getattr(self.tester, action_name)())
            return button

        # (key, caption, button style, tester method) in menu order.
        button_specs = [
            ('setup', "1. Setup Azure Credentials", 'info', 'setup_azure_credentials'),
            ('upload', "2. Upload & Process Document", 'primary', 'upload_and_process_document'),
            ('sample', "3. Test with Sample Text", 'success', 'test_with_sample_text'),
            ('stats', "4. Show Chunk Statistics", 'warning', 'show_chunk_statistics'),
            ('viz', "5. Visualize Chunks", 'warning', 'visualize_chunks'),
            ('explore', "6. Interactive Chunk Explorer", 'warning', 'interactive_chunk_explorer'),
            ('config', "7. Configure Chunker", 'info', 'configure_chunker'),
            ('perf', "8. Performance Testing", 'info', 'run_performance_test'),
            ('export', "9. Export Chunks", 'danger', 'export_chunks'),
        ]
        menu = {
            key: menu_button(caption, style, action_name)
            for key, caption, style, action_name in button_specs
        }

        def section(heading_html, *members):
            """Stack a heading above the given buttons."""
            return widgets.VBox([widgets.HTML(heading_html), *members])

        setup_section = section("<h3>üîß Setup</h3>", menu['setup'])
        processing_section = section(
            "<h3>üìÑ Document Processing</h3>", menu['upload'], menu['sample']
        )
        analysis_section = section(
            "<h3>üìä Analysis & Visualization</h3>",
            menu['stats'], menu['viz'], menu['explore']
        )
        config_section = section(
            "<h3>‚öôÔ∏è Configuration & Testing</h3>", menu['config'], menu['perf']
        )
        export_section = section("<h3>üíæ Export</h3>", menu['export'])

        # Three columns side by side.
        columns = [
            widgets.VBox([setup_section, processing_section]),
            widgets.VBox([analysis_section]),
            widgets.VBox([config_section, export_section]),
        ]
        display(widgets.HBox(columns))

        # Static HTML help shown beneath the menu.
        guide_html = """
        <div style="background-color: #f0f0f0; padding: 15px; margin-top: 20px; border-radius: 5px;">
        <h3>üìö Quick Start Guide:</h3>
        <ol>
        <li><b>Setup Azure Credentials:</b> Enter your Azure Document Intelligence endpoint and API key</li>
        <li><b>Process Document:</b> Either upload your own document or test with the sample insurance text</li>
        <li><b>Analyze Results:</b> Use the visualization and exploration tools to understand the chunking results</li>
        <li><b>Experiment:</b> Try different chunker configurations and compare performance</li>
        <li><b>Export:</b> Save your results in various formats for further analysis</li>
        </ol>

        <h3>üí° Tips:</h3>
        <ul>
        <li>Start with the sample text if you don't have Azure credentials yet</li>
        <li>Use the interactive explorer to examine individual chunks in detail</li>
        <li>Try different chunker configurations to see how they affect the results</li>
        <li>The performance testing feature helps you find optimal settings</li>
        </ul>
        </div>
        """

        display(widgets.HTML(guide_html))

# Initialize and run the test interface.
# The guard means the menu is only built when this code runs as the main
# program, not when these definitions are imported from another module.
# `interface` stays bound at module level after the menu is shown.
if __name__ == "__main__":
    interface = TestInterface()
    interface.show_main_menu()

In [None]:
!pip install langchain langchain-community langchain-openai langchain-experimental azure-ai-documentintelligence

In [None]:
import os
import re
from typing import List, Dict, Tuple, Optional, Any, Union
from dataclasses import dataclass, asdict
import json
import tempfile
from pathlib import Path

# Install required packages first
# !pip install langchain langchain-community langchain-openai langchain-experimental azure-ai-documentintelligence

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

@dataclass
class ChunkMetadata:
    """Enhanced metadata for insurance document chunks.

    A plain record with no default values: every field must be supplied when
    an instance is created, in the order declared below.
    """
    # Identifier of the chunk.
    chunk_id: str
    # Category label of the chunk; the allowed values are not defined in this class.
    chunk_type: str
    # Numeric importance weight; scale and range are not defined here (TODO confirm against the code that computes it).
    importance_score: float
    # Presumably the heading path leading to this chunk, outermost first; verify against the producer.
    section_hierarchy: List[str]
    # Whether the chunk contains at least one table, and how many.
    has_tables: bool
    table_count: int
    # Whether monetary values were found, and the matched amounts as strings.
    has_monetary_values: bool
    monetary_amounts: List[str]
    # Whether exclusion wording was found, and the matched phrases.
    has_exclusions: bool
    exclusion_phrases: List[str]
    # Policy-related terms found in the chunk.
    policy_terms: List[str]
    # Presumably references to other sections found in the text; verify against the producer.
    cross_references: List[str]
    # Header strings associated with the chunk.
    original_headers: List[str]
    # Position of the chunk; NOTE(review): whether this is 0- or 1-based is not visible here.
    chunk_position: int
    # Pair of line numbers; presumably (start, end) in the source text; confirm inclusivity with the producer.
    source_lines: Tuple[int, int]

class Settings:
    """Configuration settings for the Azure and OpenAI services.

    The values are exposed as upper-case attributes:
    ``AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT``,
    ``AZURE_DOCUMENT_INTELLIGENCE_KEY``, ``OPENAI_API_KEY`` and
    ``EMBEDDING_MODEL``.
    """

    def __init__(
        self,
        azure_endpoint: Optional[str],
        azure_key: Optional[str],
        openai_key: Optional[str],
        embedding_model: str = "text-embedding-ada-002"
    ):
        """Store the supplied credentials and model name unchanged.

        Args:
            azure_endpoint: Azure Document Intelligence endpoint URL.
            azure_key: Azure Document Intelligence API key.
            openai_key: OpenAI API key.
            embedding_model: Name of the embedding model to use. Defaults to
                the value that was previously hard-coded, so existing callers
                are unaffected.
        """
        self.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = azure_endpoint
        self.AZURE_DOCUMENT_INTELLIGENCE_KEY = azure_key
        self.OPENAI_API_KEY = openai_key
        self.EMBEDDING_MODEL = embedding_model

class AzureDocumentProcessor:
    """Thin wrapper around ``AzureAIDocumentIntelligenceLoader``.

    Holds the connection details and loader options, loads a local file into
    document objects, and flattens those documents into a single
    text/metadata dictionary.
    """

    def __init__(
        self,
        api_endpoint: str,
        api_key: str,
        api_model: str = "prebuilt-layout",
        mode: str = "markdown",
        analysis_features: Optional[List[str]] = None
    ):
        """Store the loader options.

        Args:
            api_endpoint: Azure Document Intelligence endpoint.
            api_key: Azure Document Intelligence key.
            api_model: Model name forwarded to the loader.
            mode: Output mode forwarded to the loader. The value ``"page"``
                also affects the page count reported by
                ``extract_text_and_metadata``.
            analysis_features: Optional feature names forwarded to the
                loader; ``None`` is stored as an empty list.

        Raises:
            ValueError: If the endpoint or the key is empty or ``None``.
        """
        self.api_endpoint = api_endpoint
        self.api_key = api_key
        self.api_model = api_model
        self.mode = mode
        self.analysis_features = analysis_features or []

        if not self.api_endpoint or not self.api_key:
            raise ValueError("Azure Document Intelligence endpoint and key are required")

    def process_file(self, file_path: str) -> List["Document"]:
        """Load ``file_path`` through the Azure loader and return its documents.

        Args:
            file_path: Path of a local file to analyse.

        Raises:
            FileNotFoundError: If nothing exists at ``file_path``.
            IsADirectoryError: If ``file_path`` is a directory.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        # A directory passes the existence check; reject it here with a clear
        # message instead of failing later inside the loader.
        if os.path.isdir(file_path):
            raise IsADirectoryError(f"Expected a file but got a directory: {file_path}")

        loader = AzureAIDocumentIntelligenceLoader(
            api_endpoint=self.api_endpoint,
            api_key=self.api_key,
            file_path=file_path,
            api_model=self.api_model,
            mode=self.mode,
            analysis_features=self.analysis_features
        )

        return loader.load()

    @staticmethod
    def _is_json_serializable(value: Any) -> bool:
        """Return True when ``json.dumps`` accepts ``value``."""
        try:
            json.dumps(value)
        except (TypeError, ValueError):
            return False
        return True

    def extract_text_and_metadata(self, documents: List["Document"]) -> Dict[str, Any]:
        """Merge loaded documents into one text string and one metadata dict.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.

        Returns:
            A dict with three keys. ``text`` is the page contents joined by
            blank lines. ``metadata`` is the union of all JSON-serializable
            metadata entries; on a key clash the later document wins.
            ``page_count`` is the number of documents in ``"page"`` mode and
            1 otherwise. An empty input yields empty text, empty metadata and
            a page count of 0.
        """
        if not documents:
            return {"text": "", "metadata": {}, "page_count": 0}

        combined_text = "\n\n".join(doc.page_content for doc in documents)

        combined_metadata: Dict[str, Any] = {}
        for doc in documents:
            # Values that cannot be serialized to JSON are dropped.
            combined_metadata.update({
                key: value
                for key, value in doc.metadata.items()
                if self._is_json_serializable(value)
            })

        return {
            "text": combined_text,
            "metadata": combined_metadata,
            "page_count": len(documents) if self.mode == "page" else 1
        }

class InsuranceDocumentChunker:
    def __init__(
        self,
        chunk_size: int = 1500,
        chunk_overlap: int = 200,
        min_chunk_size: int = 300,
        use_semantic_chunker: bool = True,
        embeddings_model: Optional[Any] = None,
        settings: Optional[Settings] = None
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.use_semantic_chunker = use_semantic_chunker
        self.settings = settings

        # Initialize embeddings for semantic chunker
        self.embeddings = embeddings_model or self._get_default_embeddings()

        # Azure OCR specific patterns for insurance documents
        self._compile_insurance_patterns()

        # Initialize LangChain splitters
        self._setup_splitters()

    def _get_default_embeddings(self):
        """Get default embeddings with proper API key configuration"""
        try:
            if not self.settings or not self.settings.OPENAI_API_KEY:
                print("Warning: OPENAI_API_KEY not found. Semantic chunking disabled.")
                return None

            return OpenAIEmbeddings(
                model=self.settings.EMBEDDING_MODEL,
                openai_api_key=self.settings.OPENAI_API_KEY
            )
        except Exception as e:
            print(f"Warning: Failed to initialize OpenAI embeddings: {e}. Semantic chunking disabled.")
            return None

    def _compile_insurance_patterns(self):
        """Compile patterns specific to insurance documents and Azure OCR output"""

        # Azure OCR often outputs XML-like tags
        self.table_patterns = [
            re.compile(r'<table[^>]*>.*?</table>', re.DOTALL | re.IGNORECASE),
            re.compile(r'\|[^|]+\|([^|]+\|)+', re.MULTILINE),  # Markdown tables
            re.compile(r'\+[-=]+\+.*?\+[-=]+\+', re.DOTALL),   # ASCII tables
            re.compile(r'<tr[^>]*>.*?</tr>', re.DOTALL | re.IGNORECASE),  # Table rows
        ]

        # Bold/emphasis patterns from Azure OCR
        self.emphasis_patterns = [
            re.compile(r'<strong[^>]*>(.*?)</strong>', re.IGNORECASE),
            re.compile(r'<b[^>]*>(.*?)</b>', re.IGNORECASE),
            re.compile(r'\*\*(.*?)\*\*'),
            re.compile(r'__(.*?)__'),
        ]

        # Insurance-specific monetary patterns
        self.monetary_patterns = [
            re.compile(r'‚Çπ\s*[\d,]+(?:\.\d{2})?(?:\s*(?:lakh|crore|thousand)s?)?'),
            re.compile(r'Rs\.?\s*[\d,]+(?:\.\d{2})?(?:\s*(?:lakh|crore|thousand)s?)?'),
            re.compile(r'INR\s*[\d,]+(?:\.\d{2})?'),
            re.compile(r'[\d,]+\s*(?:lakh|crore|thousand)s?', re.IGNORECASE),
            re.compile(r'\$\s*[\d,]+(?:\.\d{2})?'),  # USD
        ]

        # Insurance exclusion patterns
        self.exclusion_patterns = [
            re.compile(r'\b(?:not covered|excluded|limitation|restriction)\b', re.IGNORECASE),
            re.compile(r'\b(?:provided that|except|however|subject to|notwithstanding)\b', re.IGNORECASE),
            re.compile(r'\b(?:shall not|does not cover|will not pay|maximum limit)\b', re.IGNORECASE),
        ]

        # Policy terms patterns
        self.policy_terms_patterns = [
            re.compile(r'\b(?:sum insured|premium|deductible|copay|coverage)\b', re.IGNORECASE),
            re.compile(r'\b(?:proportionate deduction|room rent|icu charges|daycare)\b', re.IGNORECASE),
            re.compile(r'\b(?:pre-hospitalization|post-hospitalization|waiting period)\b', re.IGNORECASE),
            re.compile(r'\b(?:cashless|reimbursement|ayush|ambulance|maternity)\b', re.IGNORECASE),
        ]

        # Cross-reference patterns
        self.cross_ref_patterns = [
            re.compile(r'(?:as mentioned in|refer to|as per|subject to|in accordance with)\s+(?:section|clause|table|point|paragraph)\s*[\d.]+', re.IGNORECASE),
            re.compile(r'(?:section|clause)\s*[\d.]+(?:\.\d+)*', re.IGNORECASE),
        ]

    def _setup_splitters(self):
        """Create the header, recursive and (optionally) semantic splitters.

        Sets ``self.header_splitter``, ``self.recursive_splitter`` and
        ``self.semantic_splitter``. The semantic splitter stays ``None``
        unless semantic chunking is enabled, embeddings are available, and
        its construction succeeds.
        """

        # Markdown heading levels 1-4, labelled "Header 1" .. "Header 4".
        heading_levels = [("#" * depth, f"Header {depth}") for depth in range(1, 5)]

        # Hierarchical split on headings; heading lines stay in the content.
        self.header_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=heading_levels,
            strip_headers=False
        )

        # Size-based fallback, trying coarse separators before fine ones.
        self.recursive_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

        # Semantic splitting is optional; default to "not available".
        self.semantic_splitter = None
        if self.use_semantic_chunker and self.embeddings:
            try:
                self.semantic_splitter = SemanticChunker(
                    embeddings=self.embeddings,
                    breakpoint_threshold_type="percentile",
                    breakpoint_threshold_amount=85
                )
            except Exception as e:
                print(f"Warning: Could not initialize semantic chunker: {e}")

    def split_text(self, text: str) -> List[str]:
        """Main interface method - returns list of chunk texts"""
        chunks_with_metadata = self.chunk_with_metadata(text)
        return [chunk_text for chunk_text, _ in chunks_with_metadata]

    def chunk_with_metadata(self, text: str) -> List[Tuple[str, ChunkMetadata]]:
        """Run the full chunking pipeline and return chunks with metadata.

        The stages run in a fixed order and each prints a progress line:
        preprocess, table extraction, header split, semantic refinement,
        table chunking, and final post-processing of text plus table chunks.

        Args:
            text: Raw OCR text of the document.

        Returns:
            The ``(chunk_text, metadata)`` pairs produced by
            ``_post_process_chunks``.
        """

        print("Starting chunking process...")

        # Stage 1: clean up the OCR markup.
        cleaned = self._preprocess_azure_ocr_text(text)
        print(f"Text preprocessed. Length: {len(cleaned)}")

        # Stage 2: lift tables out, leaving placeholders in the prose.
        extracted_tables, prose = self._extract_and_preserve_tables(cleaned)
        print(f"Found {len(extracted_tables)} tables")

        # Stage 3: split the prose along its markdown headers.
        sections = self._split_by_headers(prose)
        print(f"Created {len(sections)} header-based chunks")

        # Stage 4: semantically re-split the sections that qualify.
        refined_sections = self._apply_semantic_refinement(sections)
        print(f"Refined to {len(refined_sections)} chunks")

        # Stage 5: turn each extracted table into its own chunk.
        table_sections = self._process_table_chunks(extracted_tables)
        print(f"Created {len(table_sections)} table chunks")

        # Stage 6: text chunks first, then table chunks, through post-processing.
        results = self._post_process_chunks(refined_sections + table_sections)
        print(f"Final processing complete. Total chunks: {len(results)}")

        return results

    def _preprocess_azure_ocr_text(self, text: str) -> str:
        """Preprocess Azure OCR text to clean up XML tags and formatting"""

        # Convert XML-like bold tags to markdown
        for pattern in self.emphasis_patterns:
            text = pattern.sub(r'**\1**', text)

        # Clean up common Azure OCR artifacts
        text = re.sub(r'</?p[^>]*>', '', text)
        text = re.sub(r'</?div[^>]*>', '', text)
        text = re.sub(r'<br[^>]*/?>', '\n', text)

        # Normalize whitespace
        text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
        text = re.sub(r'[ \t]+', ' ', text)

        # Convert numbered sections to markdown headers
        text = re.sub(r'^(\d+\.\s+[A-Z\s]+):?\s*$', r'## \1', text, flags=re.MULTILINE)
        text = re.sub(r'^(\d+\.\d+\s+[A-Z\s]+):?\s*$', r'### \1', text, flags=re.MULTILINE)

        return text

    def _extract_and_preserve_tables(self, text: str) -> Tuple[List[Dict], str]:
        """Extract tables and replace with placeholders"""
        tables = []
        text_without_tables = text

        for i, pattern in enumerate(self.table_patterns):
            matches = list(pattern.finditer(text_without_tables))

            for j, match in enumerate(reversed(matches)):
                table_text = match.group(0)
                start_pos = match.start()
                end_pos = match.end()

                importance_score = self._calculate_table_importance(table_text)

                table_info = {
                    'id': f'table_{len(tables)}',
                    'text': table_text,
                    'start_pos': start_pos,
                    'end_pos': end_pos,
                    'importance_score': importance_score,
                    'pattern_type': i,
                    'position': len(tables),
                }

                tables.append(table_info)

                placeholder = f"\n\n[TABLE_{table_info['id']}_PLACEHOLDER]\n\n"
                text_without_tables = (text_without_tables[:start_pos] +
                                     placeholder +
                                     text_without_tables[end_pos:])

        return tables, text_without_tables

    def _calculate_table_importance(self, table_text: str) -> float:
        """Calculate importance score for tables"""
        score = 5.0  # Base score for being a table

        # Monetary values increase importance
        monetary_matches = sum(len(p.findall(table_text)) for p in self.monetary_patterns)
        score += monetary_matches * 3.0

        # Policy terms add importance
        policy_matches = sum(len(p.findall(table_text)) for p in self.policy_terms_patterns)
        score += policy_matches * 2.0

        # Exclusion terms are critical
        exclusion_matches = sum(len(p.findall(table_text)) for p in self.exclusion_patterns)
        score += exclusion_matches * 4.0

        return score

    def _split_by_headers(self, text: str) -> List[Dict]:
        """Split text using MarkdownHeaderTextSplitter"""

        try:
            header_splits = self.header_splitter.split_text(text)

            chunks = []
            for i, doc in enumerate(header_splits):
                chunk_info = {
                    'text': doc.page_content,
                    'metadata': doc.metadata,
                    'chunk_type': 'header_section',
                    'position': i,
                    'importance_score': self._calculate_section_importance(doc.page_content),
                    'headers': list(doc.metadata.values()) if doc.metadata else []
                }
                chunks.append(chunk_info)

            return chunks

        except Exception as e:
            print(f"Header splitting failed: {e}. Using recursive splitter.")

            recursive_splits = self.recursive_splitter.split_text(text)
            chunks = []
            for i, chunk_text in enumerate(recursive_splits):
                chunk_info = {
                    'text': chunk_text,
                    'metadata': {},
                    'chunk_type': 'recursive_fallback',
                    'position': i,
                    'importance_score': self._calculate_section_importance(chunk_text),
                    'headers': []
                }
                chunks.append(chunk_info)

            return chunks

    def _apply_semantic_refinement(self, header_chunks: List[Dict]) -> List[Dict]:
        """Apply semantic chunking to complex sections"""

        if not self.semantic_splitter:
            return header_chunks

        refined_chunks = []

        for chunk in header_chunks:
            chunk_text = chunk['text']

            if (len(chunk_text) > self.chunk_size * 1.5 and
                chunk['importance_score'] > 5.0):

                try:
                    semantic_splits = self.semantic_splitter.split_text(chunk_text)

                    for i, semantic_chunk in enumerate(semantic_splits):
                        refined_chunk = chunk.copy()
                        refined_chunk['text'] = semantic_chunk
                        refined_chunk['chunk_type'] = 'semantic_refined'
                        refined_chunk['position'] = f"{chunk['position']}.{i}"
                        refined_chunk['importance_score'] = self._calculate_section_importance(semantic_chunk)
                        refined_chunks.append(refined_chunk)

                except Exception as e:
                    print(f"Semantic chunking failed for chunk {chunk['position']}: {e}")
                    refined_chunks.append(chunk)
            else:
                refined_chunks.append(chunk)

        return refined_chunks

    def _process_table_chunks(self, tables: List[Dict]) -> List[Dict]:
        """Process all tables into chunks"""
        all_table_chunks = []
        for table_info in tables:
            # Simple table processing - treat each table as a single chunk
            table_chunk = {
                'text': table_info['text'],
                'chunk_type': 'table',
                'table_id': table_info['id'],
                'importance_score': table_info['importance_score'],
                'position': table_info.get('position', 0),
            }
            all_table_chunks.append(table_chunk)
        return all_table_chunks

    def _calculate_section_importance(self, text: str) -> float:
        """Calculate importance score for text sections"""

        score = 1.0

        # Monetary values
        monetary_count = sum(len(p.findall(text)) for p in self.monetary_patterns)
        score += monetary_count * 2.0

        # Policy terms
        policy_count = sum(len(p.findall(text)) for p in self.policy_terms_patterns)
        score += policy_count * 1.5

        # Exclusions
        exclusion_count = sum(len(p.findall(text)) for p in self.exclusion_patterns)
        score += exclusion_count * 3.0

        # Cross-references
        cross_ref_count = sum(len(p.findall(text)) for p in self.cross_ref_patterns)
        score += cross_ref_count * 1.0

        return score

    def _post_process_chunks(self, all_chunks: List[Dict]) -> List[Tuple[str, ChunkMetadata]]:
        """Final processing and metadata creation"""

        final_chunks = []

        # Sort by importance
        sorted_chunks = sorted(all_chunks, key=lambda x: x['importance_score'], reverse=True)

        for i, chunk_info in enumerate(sorted_chunks):
            chunk_text = chunk_info['text']

            # Skip tiny chunks unless they're tables
            if len(chunk_text.strip()) < self.min_chunk_size and chunk_info['chunk_type'] != 'table':
                continue

            # Extract metadata
            monetary_amounts = []
            for pattern in self.monetary_patterns:
                monetary_amounts.extend(pattern.findall(chunk_text))

            exclusion_phrases = []
            for pattern in self.exclusion_patterns:
                exclusion_phrases.extend(pattern.findall(chunk_text))

            policy_terms = []
            for pattern in self.policy_terms_patterns:
                policy_terms.extend(pattern.findall(chunk_text))

            cross_references = []
            for pattern in self.cross_ref_patterns:
                cross_references.extend(pattern.findall(chunk_text))

            # Create metadata
            metadata = ChunkMetadata(
                chunk_id=f"chunk_{i}",
                chunk_type=chunk_info['chunk_type'],
                importance_score=chunk_info['importance_score'],
                section_hierarchy=chunk_info.get('headers', []),
                has_tables='[TABLE_' in chunk_text,
                table_count=chunk_text.count('[TABLE_'),
                has_monetary_values=len(monetary_amounts) > 0,
                monetary_amounts=monetary_amounts[:5],
                has_exclusions=len(exclusion_phrases) > 0,
                exclusion_phrases=exclusion_phrases[:3],
                policy_terms=policy_terms[:5],
                cross_references=cross_references[:3],
                original_headers=chunk_info.get('headers', []),
                chunk_position=i,
                source_lines=(0, 0)
            )

            final_chunks.append((chunk_text, metadata))

        return final_chunks

def print_chunk_analysis(chunks_with_metadata: List[Tuple[str, ChunkMetadata]], max_chunks: int = 5):
    """Print a readable report for the first ``max_chunks`` chunks.

    For every reported chunk this prints its id, type, importance score and
    length, then (only when present) its headers, monetary values,
    exclusion phrases and policy terms, and finally up to 500 characters of
    the chunk text.

    Args:
        chunks_with_metadata: ``(chunk_text, metadata)`` pairs to report on.
        max_chunks: Maximum number of chunks to print.
    """
    total = len(chunks_with_metadata)
    banner = '=' * 80
    rule = '-' * 60

    print(f"\n{banner}")
    print(f"CHUNK ANALYSIS - Showing top {min(max_chunks, total)} chunks")
    print(banner)

    for number, (chunk_text, metadata) in enumerate(chunks_with_metadata[:max_chunks], start=1):
        print(f"\n{rule}")
        print(f"CHUNK {number}/{total}")
        print(f"ID: {metadata.chunk_id}")
        print(f"Type: {metadata.chunk_type}")
        print(f"Importance Score: {metadata.importance_score:.2f}")
        print(f"Length: {len(chunk_text)} characters")

        # Optional details appear only when the chunk actually has them.
        if metadata.section_hierarchy:
            print(f"Headers: {' > '.join(metadata.section_hierarchy)}")

        if metadata.has_monetary_values:
            print(f"Monetary Values: {metadata.monetary_amounts}")

        if metadata.has_exclusions:
            print(f"Exclusions Found: {metadata.exclusion_phrases}")

        if metadata.policy_terms:
            print(f"Policy Terms: {metadata.policy_terms}")

        print("\nCONTENT:")
        print('-' * 20)
        # Preview is capped at 500 characters; an ellipsis marks truncation.
        ellipsis = "..." if len(chunk_text) > 500 else ""
        print(chunk_text[:500] + ellipsis)

        print(rule)

def _print_summary_statistics(chunks_with_metadata: List[Tuple[str, ChunkMetadata]]) -> None:
    """Print totals, per-type counts and feature counts for the chunks.

    The average importance score is only computed when there is at least
    one chunk, so an empty result does not raise a division by zero.
    """
    total = len(chunks_with_metadata)

    print(f"\n{'='*80}")
    print("SUMMARY STATISTICS")
    print(f"{'='*80}")
    print(f"Total chunks created: {total}")

    chunk_types = {}
    total_importance = 0
    for _, metadata in chunks_with_metadata:
        chunk_types[metadata.chunk_type] = chunk_types.get(metadata.chunk_type, 0) + 1
        total_importance += metadata.importance_score

    print(f"Chunk types: {dict(chunk_types)}")
    if total:
        print(f"Average importance score: {total_importance/total:.2f}")
    else:
        # Every chunk can be filtered out (e.g. all below the minimum size).
        print("Average importance score: n/a (no chunks)")

    monetary_chunks = sum(1 for _, meta in chunks_with_metadata if meta.has_monetary_values)
    exclusion_chunks = sum(1 for _, meta in chunks_with_metadata if meta.has_exclusions)
    table_chunks = sum(1 for _, meta in chunks_with_metadata if meta.has_tables)

    print(f"Chunks with monetary values: {monetary_chunks}")
    print(f"Chunks with exclusions: {exclusion_chunks}")
    print(f"Chunks with tables: {table_chunks}")

def main():
    """Run the interactive OCR + chunking demo.

    Prompts for the Azure endpoint and key, an optional OpenAI key and a
    document path, then processes the document with Azure OCR, chunks the
    text and prints a chunk report followed by summary statistics.

    Missing Azure credentials or a missing file end the run with an error
    message. Any exception raised while processing is printed together with
    its traceback instead of propagating.
    """

    print("Azure OCR + Advanced Chunking Demo")
    print("="*50)

    # Credentials are read interactively so they never live in the file.
    azure_endpoint = input("Enter your Azure Document Intelligence Endpoint: ").strip()
    azure_key = input("Enter your Azure Document Intelligence Key: ").strip()
    openai_key = input("Enter your OpenAI API Key (optional, press Enter to skip): ").strip()

    if not azure_endpoint or not azure_key:
        print("Error: Azure credentials are required!")
        return

    # An empty OpenAI key is passed on as None.
    settings = Settings(azure_endpoint, azure_key, openai_key or None)

    file_path = input("Enter the path to your document: ").strip()

    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    try:
        # Step 1: Process document with Azure OCR
        print("\nProcessing document with Azure OCR...")
        processor = AzureDocumentProcessor(
            api_endpoint=azure_endpoint,
            api_key=azure_key,
            mode="markdown"  # Use markdown mode for better structure
        )

        documents = processor.process_file(file_path)
        result = processor.extract_text_and_metadata(documents)

        print("Document processed successfully!")
        print(f"Pages: {result['page_count']}")
        print(f"Text length: {len(result['text'])} characters")

        # Step 2: Initialize chunker (semantic chunking only with an OpenAI key)
        print("\nInitializing advanced chunker...")
        chunker = InsuranceDocumentChunker(
            chunk_size=1200,
            chunk_overlap=150,
            min_chunk_size=200,
            use_semantic_chunker=bool(openai_key),
            settings=settings
        )

        # Step 3: Chunk the document
        print("\nChunking document...")
        chunks_with_metadata = chunker.chunk_with_metadata(result['text'])

        # Step 4: Display results
        print_chunk_analysis(chunks_with_metadata, max_chunks=8)

        # Summary statistics
        _print_summary_statistics(chunks_with_metadata)

    except Exception as e:
        # Top-level boundary of the demo: report the failure and its traceback.
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()

# Run the interactive demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()