In [None]:
"""
Costa Rica Philatelic Knowledge Graph Extractor - EXPERT VERSION
Handles all Costa Rican denomination systems and catalog syntax
By: Philately & Regex Expert
"""

import json
import re
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, asdict, field
from bs4 import BeautifulSoup
import os

import os
import json
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from matplotlib.patches import FancyBboxPatch
from collections import defaultdict
import pandas as pd

In [None]:
#!pip install networkx matplotlib pandas numpy

In [None]:

@dataclass
class PhilatelicNode:
    """One extracted catalog entity (stamp, proof, specimen, essay, variety or error).

    Only ``catalog_number`` and ``node_type`` are required; every other field
    defaults to an "empty" value so that ``to_dict`` can leave it out.
    """
    # --- identity -----------------------------------------------------------
    catalog_number: str
    node_type: str  # stamp, proof, specimen, essay, variety, error
    sub_type: Optional[str] = None  # die_proof, plate_proof, color_variety, etc.
    # --- descriptive data ---------------------------------------------------
    denomination: Optional[str] = None
    color: Optional[str] = None
    year: Optional[str] = None
    issue_name: Optional[str] = None  # heading text the parser was tracking when the node was built
    quantity: Optional[str] = None
    # --- provenance inside the source document ------------------------------
    page_number: int = 0
    reading_order: int = 0
    raw_text: str = ""
    context_before: List[str] = field(default_factory=list)
    context_after: List[str] = field(default_factory=list)
    # --- extras -------------------------------------------------------------
    attributes: List[str] = field(default_factory=list)
    reference_numbers: List[str] = field(default_factory=list)
    base_stamp: Optional[str] = None  # for proofs/varieties: the stamp number they derive from

    def to_dict(self):
        """Return the node as a plain dict without None, empty-list or empty-string values.

        Numeric zeros (e.g. ``page_number == 0``) are kept because they do not
        compare equal to any of the filtered values.
        """
        compact = {}
        for key, value in asdict(self).items():
            if value in [None, [], ""]:
                continue
            compact[key] = value
        return compact

class CostaRicaCatalogParser:
    """Expert parser for Costa Rica philatelic catalog with full denomination support"""
    
    # COMPREHENSIVE DENOMINATION REGEX
    # Handles: centavos(c), Colones(C/col), pesos(p), reales(r), fractions(½, 1/2)
    # Full words: centimos, centavos, colones, pesos, reales
    # Because the pattern is compiled with IGNORECASE, 'c' and 'C' both match
    # either case here; the upper/lower distinction is made afterwards by
    # normalize_denomination on the captured unit text.
    DENOMINATION = re.compile(
        r'\b('  # Capture group 1: the value
            r'\d+(?:[/.]?\d+)?'  # Plain numbers: 15, 1.5, 1/2
            r'|½|¼|¾'            # Or a standalone fraction symbol
        r')\s*'
        r'('  # Capture group 2: the unit
            r'c|C|col|colón|colones|p|peso|pesos|r|real|reales|'
            r'centimo|centimos|céntimo|céntimos|centavo|centavos|céntavo|céntavos'
        r')\b',
        re.IGNORECASE
    )
    
    # Surcharge pattern: "1c on ½ real", "5c on 2 reales", etc.
    # Groups: (new value in c) (old value) (real|reales).
    # NOTE(review): not referenced by the methods visible in this part of the
    # file (_parse_paragraph_entries uses its own inline pattern) - confirm usage.
    SURCHARGE = re.compile(
        r'(\d+)\s*c\s+on\s+([½¼¾\d/]+)\s*(real|reales)',
        re.IGNORECASE
    )
    
    # More precise: only match denomination after catalog number, not standalone
    # Groups: (catalog number) (denomination value) (unit).
    DENOMINATION_AFTER_CATALOG = re.compile(
        r'(?:DP|PP|S|E|I|OP|^|\s)(\d+[a-z]?)\s+'  # Catalog number
        r'(\d+(?:[/.]?\d+|½|¼)?)\s*'  # Denomination value
        r'(c|C|col|colón|colones|p|peso|pesos|r|real|reales|'
        r'centimo|centimos|céntimo|céntimos|centavo|centavos)\b',
        re.IGNORECASE
    )
    
    # Color patterns - comprehensive
    # ('bright' appears twice in the modifier alternation; the duplicate is harmless.)
    COLOR_MODIFIERS = r'(deep|light|dark|bright|pale|dull|bright)'
    COLOR_NAMES = r'(black|violet|blue|red|green|orange|brown|yellow|gray|grey|' \
                  r'scarlet|ultramarine|carmine|olive|purple|pink|rose|magenta|' \
                  r'turquoise|indigo|vermillion|sepia|slate|cobalt|crimson)'
    # Two capture groups (optional modifier, color name), so findall() yields
    # (modifier, name) tuples with '' for a missing modifier.
    COLORS = re.compile(rf'\b{COLOR_MODIFIERS}?\s*{COLOR_NAMES}\b', re.IGNORECASE)
    
    # Four-digit years 1800-2099.
    YEAR = re.compile(r'\b(18\d{2}|19\d{2}|20\d{2})\b')
    
    # Quantity: comma-grouped numbers only (e.g. 10,000 or 1,250,000).
    # Group 1: a bare comma-grouped number anywhere in the text.
    # Group 2: a comma-grouped number following "printed"/"issued"/"quantity".
    # Numbers without a thousands separator are NOT matched by either alternative.
    QUANTITY = re.compile(
        r'\b(\d{1,3}(?:,\d{3})+)\b|'  # Try comma pattern FIRST
        r'(?:printed|issued|quantity)[:\s]+(\d{1,3}(?:,\d{3})+)\b',  # keyword form also expects comma grouping
        re.IGNORECASE
    )
    
    # Reference tokens of the form "#abc-123"; the captured group excludes the '#'.
    REFERENCE = re.compile(r'#([\w-]+)')
    
    # Attribute phrases looked up by extract_attributes as lowercase substrings
    # of the text, in this order.
    ATTRIBUTES = [
        'imperf', 'imperforate', 'inverted', 'double perf', 'triple perf',
        'tete beche', 'tete-beche', 'téte-béche', 'shifted', 'gutter pair', 
        'sunk on card', 'no numeral', 'omitted center', 'inverted center', 
        'double op', 'horizontal pair', 'vertical pair', 'right margin', 
        'left margin', 'lower margin', 'upper margin', 'inverted op',
        'misplaced', 'offset', 'printed on both sides', 'blind perf',
        'photographic proof'
    ]
    
    def __init__(self):
        self.current_issue = None  # Most recent <sec> element
        self.current_sec = None          
        self.current_sub_sec = None      
        self.current_sub_sub_sec = None  
        self.current_section = None  # Proof, Regular issue, etc.
        self.current_year = None
        self.stamps_seen = set()
    
    def clean_latex(self, text: str) -> str:
        """Strip LaTeX markup from ``text`` and return the readable content.

        Two modes:

        * If the text contains a ``\\begin{array}...\\end{array}`` block, only
          the array body is returned: one output line per ``\\\\``-separated
          row, with ``&`` column separators turned into spaces. Text outside
          the array is discarded.
        * Otherwise the whole text is cleaned in place: ``\\text{..}``,
          ``\\mathrm{..}`` and other ``\\cmd{..}`` wrappers are unwrapped,
          remaining commands, braces and ``$`` are dropped, and whitespace is
          collapsed to single spaces.

        Args:
            text: Raw element text, possibly containing LaTeX.

        Returns:
            The cleaned text (rows joined with newlines in array mode).
        """
        if '\\begin{array}' in text:
            # Skip anything up to the column specification (e.g. an optional
            # "[t]" position argument) AND the specification itself ("{ll}"),
            # so that the spec letters do not leak into the first row.
            # Nested braces inside the column spec are not handled.
            array_match = re.search(
                r'\\begin\{array\}[^{]*\{[^}]*\}(.*?)\\end\{array\}', text, re.DOTALL
            )
            if array_match:
                cleaned_rows = []
                for row in array_match.group(1).split('\\\\'):
                    row = re.sub(r'\\text\s*\{\s*([^}]*)\s*\}', r'\1', row)
                    row = re.sub(r'\\mathrm\s*\{\s*([^}]*)\s*\}', r'\1', row)
                    row = re.sub(r'&', ' ', row)          # column separator
                    row = re.sub(r'\\[a-zA-Z]+', '', row)  # any remaining command
                    row = re.sub(r'[{}$\\]', '', row)
                    row = re.sub(r'\s+', ' ', row).strip()
                    if row:
                        cleaned_rows.append(row)
                return '\n'.join(cleaned_rows)
            # A malformed array falls through to the generic cleaning below.

        # "$^{7}$" / "${7}$" style superscripts become the bare content.
        text = re.sub(r'\$\^?\{(\w+)\}\$', r'\1', text)
        text = re.sub(r'\\text\s*\{\s*([^}]*)\s*\}', r'\1', text)
        text = re.sub(r'\\mathrm\s*\{\s*([^}]*)\s*\}', r'\1', text)
        text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)
        text = re.sub(r'\\[a-zA-Z]+', '', text)
        text = re.sub(r'[{}$]', '', text)
        text = re.sub(r'\s+', ' ', text)

        return text.strip()
    
    def normalize_denomination(self, value: str, unit: str) -> str:
        """Return ``value`` + ``unit`` in the compact canonical form, e.g. ``"0.5r"``.

        Unit handling:
            * an exact capital ``'C'`` means colones -> ``'col'`` (checked before
              any lower-casing, otherwise it would collide with centavos);
            * centimo/centavo spellings and ``'c'`` -> ``'c'``;
            * col/colón/colones -> ``'col'``; p/peso/pesos -> ``'p'``;
              r/real/reales -> ``'r'``;
            * anything else is simply lower-cased.

        Value handling:
            * ``½ ¼ ¾`` become ``0.5 0.25 0.75``;
            * ``a/b`` is divided and rendered with ``str(float)``; if it cannot
              be evaluated (non-numeric parts, division by zero) the original
              text is kept;
            * every other value is passed through unchanged.

        Args:
            value: Numeric part as captured from the text.
            unit: Unit part as captured from the text (case matters for 'C').

        Returns:
            The normalized value immediately followed by the normalized unit.
        """
        unit_aliases = {
            'c': 'c', 'centimo': 'c', 'centimos': 'c', 'céntimo': 'c', 'céntimos': 'c',
            'centavo': 'c', 'centavos': 'c', 'céntavo': 'c', 'céntavos': 'c',
            'col': 'col', 'colón': 'col', 'colones': 'col',
            'p': 'p', 'peso': 'p', 'pesos': 'p',
            'r': 'r', 'real': 'r', 'reales': 'r',
        }
        fraction_symbols = {'½': '0.5', '¼': '0.25', '¾': '0.75'}

        # CRITICAL: the capital-C test must run on the untouched unit text.
        if unit == 'C':
            unit_normalized = 'col'
        else:
            lowered = unit.lower()
            unit_normalized = unit_aliases.get(lowered, lowered)

        if value in fraction_symbols:
            value_normalized = fraction_symbols[value]
        elif '/' in value:
            try:
                numerator, denominator = value.split('/')[:2]
                value_normalized = str(float(numerator) / float(denominator))
            except (ValueError, ZeroDivisionError):
                # Not a usable fraction: keep the text exactly as it was read.
                value_normalized = value
        else:
            value_normalized = value

        return f"{value_normalized}{unit_normalized}"
    
    def extract_base_stamp_number(self, catalog_num: str) -> Optional[str]:
        """Return the digit run of a catalog number such as ``DP12a`` -> ``'12'``.

        The number must consist of optional leading letters, digits, and
        optional trailing letters (case-insensitive); anything else yields None.
        """
        found = re.match(r'[A-Z]*(\d+)[a-zA-Z]*$', catalog_num, re.IGNORECASE)
        return found.group(1) if found else None
    
    def extract_stamp_color(self, line: str, catalog_num: str) -> Optional[str]:
        """
        Extract color of stamp, avoiding colors of overprints, frames, values.
        Expert strategy: Get color between denomination and first delimiter.
        """
        # Find text segment after catalog and denomination
        # Pattern: [catalog] [denom] [COLOR SHOULD BE HERE] [, or 'op' or other delimiter]
        
        # Escape special regex chars in catalog_num
        cat_escaped = re.escape(catalog_num)
        
        # Try to find: catalog_num + any_denomination + color_text
        pattern = rf'{cat_escaped}\s+\d+(?:[/.]?\d+)?\s*(?:c|C|col|p|r|centimos?|centavos?|colones?|pesos?|reales?)\s+([^,]+?)(?:,|\bop\b|\bimperf|\bperf|\bframe\b|\bvalue\b|$)'
        match = re.search(pattern, line, re.IGNORECASE)
        
        if match:
            color_section = match.group(1).strip()
            
            # Stop at certain keywords
            stop_words = ['op', 'specimen', 'frame', 'value', 'imperf', 'perf', 
                         'inverted', 'double', 'triple', 'sunk', 'with', 'hole',
                         'margin', 'pair', '#', 'quantity', 'printed']
            
            color_parts = []
            for word in color_section.split():
                if word.lower() in stop_words:
                    break
                color_parts.append(word)
            
            if color_parts:
                color_text = ' '.join(color_parts).strip()
                # Verify it contains actual color
                if self.COLORS.search(color_text):
                    return color_text
        
        # Fallback: Get first color before comma or 'op'
        text_before_delim = re.split(r',|\bop\b', line, maxsplit=1)[0]
        
        # But after the denomination
        denom_match = re.search(r'\d+(?:[/.]?\d+)?\s*(?:c|C|col|p|r|centim|centav|colon|peso|real)', 
                               text_before_delim, re.IGNORECASE)
        if denom_match:
            text_after_denom = text_before_delim[denom_match.end():].strip()
            colors_matches = self.COLORS.findall(text_after_denom)
            if colors_matches:
                if isinstance(colors_matches[0], tuple):
                    color_parts = [p.strip() for p in colors_matches[0] if p.strip()]
                    return ' '.join(color_parts) if color_parts else None
                return colors_matches[0]
        
        return None
    
    def extract_attributes(self, text: str) -> List[str]:
        """Extract philatelic attributes including overprint colors"""
        text_lower = text.lower()
        found = []
        
        for attr in self.ATTRIBUTES:
            if attr in text_lower:
                found.append(attr)
        
        # Special: overprint colors (store as attribute, not as stamp color)
        if 'op' in text_lower or 'overprint' in text_lower:
            # Pattern: "blue op" or "op in red" or "red overprint"
            op_color_patterns = [
                r'(black|blue|red|green|violet|brown|orange|yellow)\s+op\b',
                r'\bop\b.*?\bin\s+(black|blue|red|green|violet|brown|orange|yellow)',
                r'(black|blue|red|green|violet|brown|orange|yellow)\s+overprint'
            ]
            
            for pattern in op_color_patterns:
                matches = re.finditer(pattern, text_lower)
                for match in matches:
                    color = match.group(1)
                    attr_name = f'overprint_{color}'
                    if attr_name not in found:
                        found.append(attr_name)
        
        return found
    
    def parse_catalog_entry(self, line: str) -> List[Dict]:
        """Parse line and extract catalog entries with proper denomination handling"""
        entries = []
        
        # OCR CORRECTION
        line = re.sub(r'\b(DP|PP|S)I(\d)', r'\g<1>1\2', line)
        line = re.sub(r'\b(DP|PP|S)I([a-z])', r'\g<1>1\2', line)
        line = re.sub(r'\b(S|PP|DP)l([a-z])', r'\g<1>1\2', line)
        
        # FILTERS
        if re.search(r'Decree[s]?\s*#\s*\d+', line, re.IGNORECASE):
            return entries
        if re.search(r'\(Ref\s+[A-Z]+\s+\d+', line, re.IGNORECASE):
            return entries
        if re.search(r'Accord\s+\d+', line, re.IGNORECASE):
            return entries
        
        # Filter OCR corruption (2011 repeated 3+ times)
        match = re.search(r'\b(\d+[a-zA-Z]*)\b', line)
        if match:
            first_cat = match.group(1)
            count = len(re.findall(rf'\b{re.escape(first_cat)}\b', line))
            if count >= 3:
                return entries
        
        # NEW: Handle bare numbers separated by newlines (e.g., "5\n\n\n6\n\n\n7")
        bare_numbers = re.findall(r'^(\d+)$', line, re.MULTILINE)
        if bare_numbers and len(bare_numbers) >= 2:
            for num in bare_numbers:
                if int(num) <= 500:  # Reasonable stamp number
                    entries.append({
                        'catalog_number': num,
                        'node_type': 'stamp',
                        'sub_type': 'regular_issue',
                        'base_stamp': None
                    })
            return entries
        
        # NEW: Handle concatenated M entries (M12M12a1c green...)
        # Pattern: M## followed immediately by M##a or more M entries
        m_pattern = r'\b(M\d+[a-z]?)\b'
        m_matches = list(re.finditer(m_pattern, line, re.IGNORECASE))
        if len(m_matches) >= 2:
            for m_match in m_matches:
                cat_num = m_match.group(1).upper()
                base = self.extract_base_stamp_number(cat_num)
                if cat_num not in [e['catalog_number'] for e in entries]:
                    entries.append({
                        'catalog_number': cat_num,
                        'node_type': 'specimen',
                        'sub_type': 'specimen',
                        'base_stamp': base
                    })
            # If we found M entries, return early to avoid duplicate processing
            if entries:
                return entries
        
        # NEW: Handle varieties with space after number (LaTeX cleaned: "7 a surcharge...")
        # This handles the pattern after clean_latex converts "${ }^{7} \\mathrm{a}" to "7 a"
        variety_space_pattern = r'\b(\d+)\s+([a-z])\b'
        for match in re.finditer(variety_space_pattern, line):
            num = match.group(1)
            letter = match.group(2)
            cat_num = f"{num}{letter}"
            # Skip if this looks like a dimension or other non-catalog pattern
            if int(num) > 2000:
                continue
            if cat_num not in [e['catalog_number'] for e in entries]:
                entries.append({
                    'catalog_number': cat_num,
                    'node_type': 'variety',
                    'sub_type': 'variety',
                    'base_stamp': num
                })
        
        # Die Proof: DP###
        for match in re.finditer(r'\bDP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"DP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'die_proof',
                'base_stamp': base
            })
        
        # Plate Proof: PP###
        for match in re.finditer(r'\bPP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"PP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'plate_proof',
                'base_stamp': base
            })
        
        # Color Proof: CP###
        for match in re.finditer(r'\bCP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"CP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'color_proof',
                'base_stamp': base
            })
        
        # Specimen: S### (but not in concatenated M pattern already handled)
        for match in re.finditer(r'\bS(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"S{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'specimen',
                'sub_type': 'specimen',
                'base_stamp': base
            })
        
        # Essay: E###
        for match in re.finditer(r'\bE([I\d]+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"E{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'essay',
                'sub_type': 'essay',
                'base_stamp': base
            })
        
        # Imperforate: I###
        for match in re.finditer(r'\bI(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"I{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'variety',
                'sub_type': 'imperforate',
                'base_stamp': base
            })
        
        # Overprint Proof: OP###
        for match in re.finditer(r'\bOP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"OP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'overprint_proof',
                'base_stamp': base
            })
        
        # Base stamp with denomination
        for match in re.finditer(
            r'(?:^|\s)(\d+[a-zA-Z]*)\s+(\d+(?:[/.]?\d+|½|¼)?)\s*'
            r'(c|C|col|colón|colones|p|peso|pesos|r|real|reales|'
            r'centim|céntim|centav|céntav)\b',
            line, re.IGNORECASE
        ):
            cat_num = match.group(1).strip()
            denom_value = match.group(2)
            
            try:
                if int(denom_value.split('.')[0].split('/')[0]) > 500:
                    continue
            except:
                pass
            
            if any(e['catalog_number'] == cat_num for e in entries):
                continue
            
            if re.match(r'\d+[a-zA-Z]+$', cat_num, re.IGNORECASE):
                base = self.extract_base_stamp_number(cat_num)
                entries.append({
                    'catalog_number': cat_num,
                    'node_type': 'variety',
                    'sub_type': 'variety',
                    'base_stamp': base
                })
            else:
                entries.append({
                    'catalog_number': cat_num,
                    'node_type': 'stamp',
                    'sub_type': 'regular_issue',
                    'base_stamp': None
                })
        
        return entries
    def extract_denominations(self, line: str) -> List[Tuple[str, str]]:
        """Extract denominations with their normalized values."""
        denoms = []
        
        for match in self.DENOMINATION.finditer(line):
            value = match.group(1)
            unit = match.group(2)
            
            # ONLY skip if it's EXACTLY a 2-character catalog pattern at word boundary
            # AND appears at the START of text (catalog numbers come first)
            matched_text = match.group(0)
            
            # Pattern: "1c" or "2a" appearing isolated (not "5c" in middle of sentence)
            if re.match(r'^[1-9][a-z]$', matched_text):
                # Check if this appears at the very start or after whitespace only
                match_pos = match.start()
                prefix = line[:match_pos].strip()
                
                # If there's meaningful text before this match, it's likely a real denomination
                if prefix and not re.match(r'^\d+$', prefix):
                    # Text before suggests this is mid-sentence denomination, keep it
                    pass
                elif not prefix or re.match(r'^\d+$', prefix):
                    # This is at start or after just a number - might be catalog number
                    # Skip ONLY if followed by space and descriptive text (not another number)
                    suffix = line[match.end():match.end()+20].strip()
                    if suffix and re.match(r'^[a-zA-Z]', suffix):
                        # Followed by letters = likely catalog number like "1c double"
                        continue
            
            # Skip quantities
            try:
                num_val = float(value.replace('½', '.5').replace('¼', '.25').split('/')[0])
                if num_val > 500:
                    continue
            except:
                pass
            
            denoms.append((value, unit))
        
        return denoms
    
    def parse_element(self, element: Dict, context_before: List[str], 
                 context_after: List[str], page_number: int) -> List[PhilatelicNode]:
        """Parse element and extract nodes - EXPERT VERSION"""
        text = element.get('text', '')
        label = element.get('label', '')
        reading_order = element.get('reading_order', 0)
        
        if label in ['header', 'foot', 'fig']:
            return []
        
        # CRITICAL: <sec>, <sub_sec>, <sub_sub_sec> define hierarchy
        # Priority: sub_sub_sec > sub_sec > sec
        if label == 'sec':
            self.current_sec = text
            if not hasattr(self, 'current_sub_sec') or not self.current_sub_sec:
                self.current_issue = text
            year_match = self.YEAR.search(text)
            if year_match:
                self.current_year = year_match.group(1)
            return []

        if label == 'sub_sec':
            self.current_sub_sec = text
            if not hasattr(self, 'current_sub_sub_sec') or not self.current_sub_sub_sec:
                self.current_issue = text
            return []

        if label == 'sub_sub_sec':
            self.current_sub_sub_sec = text
            self.current_issue = text
            return []
        
        # Section markers
        text_lower = text.lower().strip()
        if any(section in text_lower for section in ['proof', 'proofs', 'regular issue', 'essays', 'essay', 'plate proof', 'die proof', 'specimen', 'photographic proof']):
            for section in ['proof', 'proofs', 'regular issue', 'essays', 'essay', 'plate proof', 'die proof', 'specimen', 'regular issues', 'photographic proof', 'photographic proofs']:
                if section in text_lower:
                    self.current_section = section
                    break
        
        nodes = []
        
        # Handle tables
        if label == 'tab':
            nodes.extend(self._parse_table(text, page_number, reading_order, context_before))
            return nodes
        
        # Check if paragraph contains catalog entries
        # Special handling for bare catalog numbers separated by newlines
        if label == 'para':
            lines_temp = [l.strip() for l in text.split('\n') if l.strip()]
            bare_num_pattern = r'^\d+$'
            
            # Check if ALL lines are just bare numbers
            if len(lines_temp) >= 2 and all(re.match(bare_num_pattern, l) for l in lines_temp):
                for num in lines_temp:
                    if 1 <= int(num) <= 500:
                        node = PhilatelicNode(
                            catalog_number=num,
                            node_type='stamp',
                            sub_type='regular_issue',
                            denomination=None,
                            color=None,
                            year=self.current_year,
                            issue_name=self.current_issue,
                            quantity=None,
                            page_number=page_number,
                            reading_order=reading_order,
                            raw_text=text,
                            context_before=context_before[-3:],
                            context_after=context_after[:1],
                            attributes=[],
                            reference_numbers=[],
                            base_stamp=None
                        )
                        nodes.append(node)
                return nodes  # Only return for THIS paragraph
    
            # If not bare numbers, check for paragraph entries
            if re.search(r'(?:^|\n)\s*[A-Z]*\d+[a-z]*\s+', text):
                nodes.extend(self._parse_paragraph_entries(
                    text, page_number, reading_order, context_before
                ))
                if nodes:
                    return nodes
        
        # Clean and process remaining content
        clean_text = self.clean_latex(text)
        
        # DEBUG: Print cleaned text for specific problematic elements
        if page_number in [10, 11] and label == 'para' and reading_order in [17, 10, 11]:
            print(f"\n=== DEBUG PAGE {page_number}, ORDER {reading_order} ===")
            print(f"ORIGINAL: {text[:200]}")
            print(f"CLEANED:  {clean_text[:200]}")
            print(f"===\n")
        
        lines = [line.strip() for line in clean_text.split('\n') if line.strip()]
        
        for line in lines:
            entries = self.parse_catalog_entry(line)
            
            if not entries:
                continue
            
            # Extract denominations
            denoms_raw = self.extract_denominations(line)
            denominations = [self.normalize_denomination(v, u) for v, u in denoms_raw]
            
            # Extract colors
            colors = []
            for entry in entries:
                cat_num = entry['catalog_number']
                color = self.extract_stamp_color(line, cat_num)
                colors.append(color)
            
            # Fallback color extraction
            if not any(colors):
                colors_matches = self.COLORS.findall(line)
                for color_match in colors_matches:
                    if isinstance(color_match, tuple):
                        color_parts = [p.strip() for p in color_match if p.strip()]
                        if color_parts:
                            colors.append(' '.join(color_parts))
                    else:
                        colors.append(color_match.strip())
            
            # Extract quantity
            quantity = None
            qty_match = self.QUANTITY.search(line)
            if qty_match:
                raw_quantity = qty_match.group(1) or qty_match.group(2)
                quantity = raw_quantity.replace(',', '')
            
            references = self.REFERENCE.findall(line)
            attributes = self.extract_attributes(line)
            
            # Create nodes
            for i, entry in enumerate(entries):
                cat_num = entry['catalog_number']
                node_type = entry['node_type']
                sub_type = entry['sub_type']
                base_stamp = entry['base_stamp']
                
                if node_type == 'stamp' and cat_num in self.stamps_seen:
                    continue
                
                node = PhilatelicNode(
                    catalog_number=cat_num,
                    node_type=node_type,
                    sub_type=sub_type,
                    denomination=denominations[0] if denominations else None,
                    color=colors[i] if i < len(colors) else (colors[0] if colors else None),
                    year=self.current_year,
                    issue_name=self.current_issue,
                    quantity=quantity if node_type == 'stamp' else None,
                    page_number=page_number,
                    reading_order=reading_order,
                    raw_text=line,
                    context_before=context_before[-3:],
                    context_after=context_after[:1],
                    attributes=attributes,
                    reference_numbers=references,
                    base_stamp=base_stamp
                )
                
                if node_type == 'stamp' and self.current_section and 'regular issue' in self.current_section.lower():
                    self.stamps_seen.add(cat_num)
                
                nodes.append(node)
        
        return nodes
    def _parse_paragraph_entries(self, text: str, page_number: int, 
                             reading_order: int, context: List[str]) -> List[PhilatelicNode]:
        """Parse catalog entries from paragraph format (not tables).

        Each entry starts at the beginning of a line with a catalog number
        (optional upper-case prefix, digits, optional lower-case suffix) and
        runs until the next such line or the end of the text.
        Examples: "5    1c on ½r blue", "7a   surcharge on 1A".

        Classification follows the same prefixes as ``parse_catalog_entry``:
        DP/PP/CP and OP are proofs, S/M specimens, E essays, I imperforate
        varieties, ``<digits><letters>`` varieties, everything else a regular
        stamp.

        NOTE(review): the description handed to ``extract_stamp_color`` and
        ``extract_denominations`` no longer contains the catalog number, so a
        leading one-digit denomination ("5c blue") is discarded by the
        catalog-look-alike rule in ``extract_denominations`` - confirm.

        Args:
            text: Raw paragraph text.
            page_number: Page the paragraph was found on.
            reading_order: Reading order of the paragraph on its page.
            context: Preceding texts; the last three are stored on each node.

        Returns:
            One node per entry found (possibly empty).
        """
        nodes = []

        # Pattern: catalog number at start of line, description up to the next one
        pattern = r'(?:^|\n)\s*([A-Z]*\d+[a-z]*)\s+(.*?)(?=\n\s*[A-Z]*\d+[a-z]*\s+|\Z)'
        proof_sub_types = {'DP': 'die_proof', 'PP': 'plate_proof', 'CP': 'color_proof'}

        for match in re.finditer(pattern, text, re.MULTILINE | re.DOTALL):
            cat_num = match.group(1).strip()
            description = match.group(2).strip()
            # Exact upper-case prefix, used for the one/two-letter kinds below
            prefix = re.match(r'[A-Z]*', cat_num).group(0)

            # Determine node type
            if cat_num.startswith(('DP', 'PP', 'CP')):
                node_type = 'proof'
                sub_type = proof_sub_types[cat_num[:2]]
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif prefix == 'OP':
                node_type = 'proof'
                sub_type = 'overprint_proof'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif cat_num.startswith(('S', 'M')):
                node_type = 'specimen'
                sub_type = 'specimen'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif prefix == 'E':
                node_type = 'essay'
                sub_type = 'essay'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif prefix == 'I':
                node_type = 'variety'
                sub_type = 'imperforate'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif re.match(r'^\d+[a-z]+$', cat_num):
                # Letter suffix (one or more letters) marks a variety of the base number
                node_type = 'variety'
                sub_type = 'variety'
                base_stamp = self.extract_base_stamp_number(cat_num)
            else:
                node_type = 'stamp'
                sub_type = 'regular_issue'
                base_stamp = None

            # Extract denomination (first one found in the description)
            denoms_raw = self.extract_denominations(description)
            denomination = self.normalize_denomination(*denoms_raw[0]) if denoms_raw else None

            # A surcharge "Xc on Yr" / "Xc on Yp" overrides it, kept verbatim
            surcharge_match = re.search(r'(\d+)\s*c\s+on\s+([½¼\d/]+)\s*([rp])', description)
            if surcharge_match:
                new_val = surcharge_match.group(1)
                old_val = surcharge_match.group(2)
                old_unit = surcharge_match.group(3)
                denomination = f"{new_val}c on {old_val}{old_unit}"

            node = PhilatelicNode(
                catalog_number=cat_num,
                node_type=node_type,
                sub_type=sub_type,
                denomination=denomination,
                color=self.extract_stamp_color(description, cat_num),
                year=self.current_year,
                issue_name=self.current_issue,
                quantity=None,
                page_number=page_number,
                reading_order=reading_order,
                raw_text=description,
                context_before=context[-3:],
                attributes=self.extract_attributes(description),
                base_stamp=base_stamp
            )
            nodes.append(node)

        return nodes
    
    def _parse_table(self, html_text: str, page_number: int, 
                reading_order: int, context: List[str]) -> List[PhilatelicNode]:
        """Parse an HTML table into philatelic nodes, one table row at a time.

        Every <tr> is reduced to the stripped text of its <td> cells. Rows that
        are completely empty, have fewer than two cells, or look like a header
        row are ignored. Each remaining row is classified by its shape:

        * Case 1 - two cells, first cell is digits followed by letters
          (e.g. "12a"): one variety of the base stamp number.
        * Case 2 - two cells, first cell is digits only: a regular stamp,
          unless the description contains a variety keyword, in which case the
          row is skipped (catalog number "1" is kept as a stamp regardless).
        * Case 3 - three or more cells, first cell is digits with optional
          letters: a variety (plus extra variety codes found in the second
          cell) or a stamp (optionally with a variety code in the second cell).
        * Case 4 - anything else: delegated to self.parse_catalog_entry on the
          space-joined row text.

        For every row that produced entries, denomination, surcharge, color,
        attributes and quantity are extracted from the joined row text and one
        PhilatelicNode is created per entry.

        Args:
            html_text: HTML markup of the table.
            page_number: Page number stored on each created node.
            reading_order: Reading order stored on each created node.
            context: Preceding context strings; only the last three are kept
                on each node (context_before).

        Returns:
            The nodes created from this table, in row order.

        Side effects:
            Prints DEBUG lines to stdout. Reads self.current_year,
            self.current_issue and self.current_section. Adds stamp catalog
            numbers to self.stamps_seen when the current section name contains
            "regular", and skips stamps already present in that set.
        """
        soup = BeautifulSoup(html_text, 'html.parser')
        rows = []
        
        # Collect the stripped <td> texts of each <tr>; <th> cells are not read.
        # Rows whose cells are all empty are dropped here.
        for tr in soup.find_all('tr'):
            cells = [td.get_text(strip=True) for td in tr.find_all('td')]
            if cells and not all(c == '' for c in cells):
                rows.append(cells)
        
        nodes = []
        # Denomination carried over to following rows that show none of their own.
        last_denomination = None
        
        for row in rows:
            # Catalog numbers already emitted from this table (recomputed per row);
            # used below to avoid re-creating an additional variety.
            created_catalog_numbers = set(node.catalog_number for node in nodes)
            
            if not row or len(row) < 2:
                continue
            
            print(f"DEBUG: Processing table row with {len(row)} cells: {row}")
            
            # Skip headers: at most 5 cells and every non-empty cell contains
            # one of the header keywords.
            header_keywords = ['perf', 'imperf', 'date', 'order', 'plate', 'essays']
            if len(row) <= 5 and all(any(kw in cell.lower() for kw in header_keywords) for cell in row if cell):
                print(f"DEBUG: Skipping header row")
                continue
            
            first_cell = row[0].strip()
            entries = []
            
            # Case 1: 2-cell variety rows
            if len(row) == 2 and re.match(r'^\d+[a-zA-Z]+$', first_cell):
                base = self.extract_base_stamp_number(first_cell)
                entries = [{
                    'catalog_number': first_cell,
                    'node_type': 'variety',
                    'sub_type': 'variety',
                    'base_stamp': base
                }]
                row_text = ' '.join(row)
                print(f"DEBUG: 2-cell variety row: {first_cell} -> base {base}")
                # Two-cell rows never inherit the denomination of an earlier row.
                last_denomination = None
            
            # Case 2: 2-cell bare number rows
            elif len(row) == 2 and re.match(r'^\d+$', first_cell):
                desc = row[1].lower()
                variety_keywords = ['perf', 'diagonal', 'horizontal', 'cracked', 'impression', 'pair', 'inverted']
                
                # Special case: catalog #1 is likely a real stamp even with variety keywords
                if first_cell == '1' and any(kw in desc for kw in variety_keywords):
                    entries = [{
                        'catalog_number': first_cell,
                        'node_type': 'stamp',
                        'sub_type': 'regular_issue',
                        'base_stamp': None
                    }]
                    row_text = ' '.join(row)
                    print(f"DEBUG: Special case - stamp #1")
                    last_denomination = None
                
                # A bare number with a variety-style description is treated as a
                # variety whose letter was lost, and the row is dropped entirely.
                elif any(kw in desc for kw in variety_keywords):
                    print(f"DEBUG: Row {first_cell} appears to be variety with missing letter, skipping")
                    continue
                else:
                    entries = [{
                        'catalog_number': first_cell,
                        'node_type': 'stamp',
                        'sub_type': 'regular_issue',
                        'base_stamp': None
                    }]
                    row_text = ' '.join(row)
                    print(f"DEBUG: 2-cell stamp row: {first_cell}")
                    last_denomination = None
            
            # Case 3: Multi-cell rows (3+)
            elif len(row) >= 3 and re.match(r'^\d+[a-zA-Z]*$', first_cell):
                rest_of_row = ' '.join(row[1:])
                row_text_full = ' '.join(row)
                
                print(f"DEBUG: Found catalog number in first cell: {first_cell}")
                
                # Subcase 3a: It's a variety
                if re.match(r'^\d+[a-zA-Z]+$', first_cell):
                    base = self.extract_base_stamp_number(first_cell)
                    entries = [{
                        'catalog_number': first_cell,
                        'node_type': 'variety',
                        'sub_type': 'variety',
                        'base_stamp': base
                    }]
                    print(f"DEBUG: Identified as variety: {first_cell} -> base {base}")
                    
                    # Check column 2 for additional varieties
                    # (this guard is always true here: Case 3 requires len(row) >= 3)
                    if len(row) >= 2:
                        second_cell = row[1].strip()
                        # Fixed version - only single letters or specific patterns:
                        additional_varieties = []
                        # Pattern 1: Single lowercase letter (a, b, c)
                        for match in re.finditer(r'\b([a-z])\b', second_cell):
                            additional_varieties.append(match.group(1))
                        # Pattern 2: Uppercase + lowercase (like "Aa" from "1Aa")
                        # NOTE(review): this also matches the first two letters of any
                        # capitalized word in the cell - confirm that is acceptable.
                        for match in re.finditer(r'([A-Z][a-z])', second_cell):
                            additional_varieties.append(match.group(1))
                        
                        # Filter out if it's part of a color word
                        # NOTE(review): the test is "some color word starts with the
                        # code", so single-letter codes such as b, d, g, l, o, p, r, v,
                        # w and y are always discarded - confirm this is intended.
                        color_words = ['dark', 'light', 'deep', 'pale', 'bright', 'red', 'green', 'blue', 
                                    'violet', 'yellow', 'orange', 'brown', 'black', 'white', 'gray', 'rose']
                        additional_varieties = [v for v in additional_varieties 
                                            if not any(word.startswith(v.lower()) for word in color_words)]
                        
                        for var_code in additional_varieties:
                            full_cat = f"{base}{var_code}"
                            # Skip duplicates
                            if full_cat not in [e['catalog_number'] for e in entries] and full_cat != first_cell and full_cat not in created_catalog_numbers:
                                entries.append({
                                    'catalog_number': full_cat,
                                    'node_type': 'variety',
                                    'sub_type': 'variety',
                                    'base_stamp': base
                                })
                                print(f"DEBUG: Additional variety: {full_cat}")
                
                # Subcase 3b: It's a stamp
                else:
                    denoms_raw = self.extract_denominations(rest_of_row)
                    
                    if denoms_raw:
                        # Found denomination in rest of row
                        entries = [{
                            'catalog_number': first_cell,
                            'node_type': 'stamp',
                            'sub_type': 'regular_issue',
                            'base_stamp': None
                        }]
                        print(f"DEBUG: Stamp with denomination in row: {first_cell}")
                    else:
                        # No denomination found - use heuristics
                        second_cell = row[1].strip() if len(row) > 1 else ""
                        third_col_text = ' '.join(row[2:]) if len(row) >= 3 else ""
                        
                        # Heuristic 1: Column 2 is a variety code
                        if re.match(r'^\d+[a-z]$', second_cell):
                            denoms_in_desc = self.extract_denominations(third_col_text)
                            last_col = row[-1].strip() if row else ""
                            
                            # NOTE(review): rest_of_row is reassigned in the two branches
                            # below but is not read again afterwards in this method.
                            if denoms_in_desc:
                                entries = [{
                                    'catalog_number': first_cell,
                                    'node_type': 'stamp',
                                    'sub_type': 'regular_issue',
                                    'base_stamp': None
                                }]
                                rest_of_row = third_col_text
                                print(f"DEBUG: Stamp {first_cell} - denom in column 3")
                            # Last cell shaped like "NN,NNN" / "NNN,NNN" is taken as a
                            # printed quantity, which is treated as evidence of a stamp row.
                            elif re.match(r'^\d{2,3},\d{3}$', last_col):
                                entries = [{
                                    'catalog_number': first_cell,
                                    'node_type': 'stamp',
                                    'sub_type': 'regular_issue',
                                    'base_stamp': None
                                }]
                                rest_of_row = third_col_text
                                print(f"DEBUG: Stamp {first_cell} inferred from quantity")
                                last_denomination = None
                            else:
                                print(f"DEBUG: No denomination for {first_cell}, skipping")
                                continue
                            
                            # Add variety from column 2
                            # (base is the code without its trailing letter)
                            base_of_variety = second_cell[:-1]
                            entries.append({
                                'catalog_number': second_cell,
                                'node_type': 'variety',
                                'sub_type': 'variety',
                                'base_stamp': base_of_variety
                            })
                            print(f"DEBUG: Added variety {second_cell}")
                        else:
                            print(f"DEBUG: No denomination for {first_cell}, skipping")
                            continue
                    
                    # Check for variety in same row (after stamp created)
                    # The membership test keeps the heuristic branch above from adding
                    # the same variety twice.
                    if entries and entries[0]['node_type'] == 'stamp' and len(row) >= 2:
                        second_cell = row[1].strip()
                        if re.match(r'^\d+[a-z]$', second_cell) and second_cell not in [e['catalog_number'] for e in entries]:
                            base_of_variety = second_cell[:-1]
                            entries.append({
                                'catalog_number': second_cell,
                                'node_type': 'variety',
                                'sub_type': 'variety',
                                'base_stamp': base_of_variety
                            })
                            print(f"DEBUG: Variety {second_cell} in same row")
                
                # All Case 3 rows are analysed below on the full joined row text.
                row_text = row_text_full
            
            # Case 4: Normal parsing
            else:
                row_text = ' '.join(row)
                print(f"DEBUG: Normal parsing")
                entries = self.parse_catalog_entry(row_text)
            
            print(f"DEBUG: Found {len(entries)} entries: {[e['catalog_number'] for e in entries]}")
            
            if entries:
                # Surcharge: groups 1-3 of self.SURCHARGE are used as new value,
                # old value and old unit; recorded as an attribute on every node.
                surcharge_match = self.SURCHARGE.search(row_text)
                surcharge_info = None
                if surcharge_match:
                    new_value = surcharge_match.group(1)
                    old_value = surcharge_match.group(2)
                    old_unit = surcharge_match.group(3)
                    surcharge_info = f"{new_value}c on {old_value} {old_unit}"
                
                denoms_raw = self.extract_denominations(row_text)
                denominations = [self.normalize_denomination(v, u) for v, u in denoms_raw]
                print(f"DEBUG: Denominations: {denominations}")
                
                # Rows without a denomination inherit the last one seen; rows with
                # one update the carried value.
                if not denominations and last_denomination:
                    denominations = [last_denomination]
                    print(f"DEBUG: Using last denomination: {last_denomination}")
                elif denominations:
                    last_denomination = denominations[0]
                
                # Fallback: one or more "E<number>" codes followed by a bare number;
                # the bare number is normalized with unit 'c'.
                if not denominations:
                    special_match = re.search(
                        r'([E]\d+[a-zA-Z]*(?:\s+[E]\d+[a-zA-Z]*)*)\s+(\d+)\s+',
                        row_text, re.IGNORECASE
                    )
                    if special_match:
                        denom_value = special_match.group(2)
                        denominations = [self.normalize_denomination(denom_value, 'c')]
                        last_denomination = denominations[0]
                
                # One color lookup per entry, keyed by that entry's catalog number.
                colors = []
                for entry in entries:
                    color = self.extract_stamp_color(row_text, entry['catalog_number'])
                    colors.append(color)
                
                attributes = self.extract_attributes(row_text)
                
                # Quantity comes from whichever of the first two QUANTITY groups
                # matched; thousands separators are removed.
                quantity = None
                qty_match = self.QUANTITY.search(row_text)
                if qty_match:
                    raw_quantity = qty_match.group(1) or qty_match.group(2)
                    quantity = raw_quantity.replace(',', '')
                    print(f"DEBUG: Quantity: {quantity}")
                
                for i, entry in enumerate(entries):
                    cat_num = entry['catalog_number']
                    node_type = entry['node_type']
                    sub_type = entry['sub_type']
                    base_stamp = entry['base_stamp']
                    
                    # Copy so the surcharge tag is not appended to the shared list.
                    final_attributes = attributes.copy()
                    if surcharge_info:
                        final_attributes.append(f'surcharge_{surcharge_info}')
                    
                    # A stamp already recorded in stamps_seen is not emitted again.
                    if node_type == 'stamp' and cat_num in self.stamps_seen:
                        print(f"DEBUG: Skipping duplicate stamp {cat_num}")
                        continue
                    
                    # Every node of the row gets the first denomination; quantity is
                    # only stored on stamps. colors holds one item per entry, so the
                    # fallback in the color expression is not expected to trigger.
                    node = PhilatelicNode(
                        catalog_number=cat_num,
                        node_type=node_type,
                        sub_type=sub_type,
                        denomination=denominations[0] if denominations else None,
                        color=colors[i] if i < len(colors) else (colors[0] if colors else None),
                        year=self.current_year,
                        issue_name=self.current_issue,
                        quantity=quantity if node_type == 'stamp' else None,
                        page_number=page_number,
                        reading_order=reading_order,
                        raw_text=row_text,
                        context_before=context[-3:],
                        attributes=final_attributes,
                        base_stamp=base_stamp
                    )
                    nodes.append(node)
                    print(f"DEBUG: Created {cat_num} - {node.denomination} {node.color}")
                    
                    # Only stamps found while the current section name contains
                    # "regular" are remembered for duplicate suppression.
                    if node_type == 'stamp' and self.current_section and 'regular' in self.current_section.lower():
                        self.stamps_seen.add(cat_num)
                        print(f"DEBUG: Added {cat_num} to stamps_seen")
            
        return nodes
class PhilatelicExtractor:
    """Main extractor with expert Costa Rica knowledge.

    Drives a ``CostaRicaCatalogParser`` over page dictionaries (each holding a
    ``page_number`` and a list of ``elements``), collects the nodes the parser
    returns and links every node that names a ``base_stamp`` to that stamp.
    """

    # Labels of elements that are never used as surrounding context.
    _NON_CONTEXT_LABELS = ('header', 'foot', 'fig')

    def __init__(self, context_window: Tuple[int, int] = (-3, 1)):
        """Create the extractor and its parser.

        Args:
            context_window: ``(before, after)`` offsets relative to the current
                element; the default looks at up to three elements before and
                one element after it.
        """
        self.context_window = context_window
        self.parser = CostaRicaCatalogParser()

    @staticmethod
    def _load_pages(input_path: str) -> Optional[List[Dict]]:
        """Read a recognition JSON file and return its list of page dicts.

        Accepted layouts: a list of pages, a single page dict (recognised by
        its ``page_number`` key), or a wrapper dict with a ``pages`` list.

        Returns:
            The list of pages, or ``None`` when the layout is not recognised.
        """
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            if 'page_number' in data:
                return [data]
            if isinstance(data.get('pages'), list):
                return data['pages']
        return None

    def _reset_parser_state(self) -> None:
        """Clear the parser state that is carried from page to page."""
        self.parser.current_issue = None
        self.parser.current_section = None
        self.parser.current_year = None
        self.parser.stamps_seen = set()

    def get_context(self, elements: List[Dict], index: int) -> Tuple[List[str], List[str]]:
        """Get context around element.

        Args:
            elements: Page elements in reading order.
            index: Position of the element whose neighbours are wanted.

        Returns:
            ``(context_before, context_after)``: the non-empty texts of the
            neighbouring elements inside the context window, leaving out
            header, foot and fig elements.
        """
        start = max(0, index + self.context_window[0])
        end_after = min(len(elements), index + self.context_window[1] + 1)

        def texts_between(lo: int, hi: int) -> List[str]:
            collected = []
            for i in range(lo, hi):
                text = elements[i].get('text', '')
                if text and elements[i].get('label') not in self._NON_CONTEXT_LABELS:
                    collected.append(text)
            return collected

        return texts_between(start, index), texts_between(index + 1, end_after)

    def process_page(self, page_data: Dict) -> List["PhilatelicNode"]:
        """Process single page.

        Elements are visited in ascending ``reading_order``. The section
        trackers of the parser are cleared for every page, while the current
        issue, the current year and ``stamps_seen`` are deliberately kept
        because an issue can continue on the next page.

        Args:
            page_data: Dict with ``page_number`` and ``elements``.

        Returns:
            All nodes the parser produced for this page.
        """
        elements = page_data.get('elements', [])
        page_number = page_data.get('page_number', 0)

        elements = sorted(elements, key=lambda x: x.get('reading_order', 0))

        # Reset state for new page
        # BUT keep issue_name if no new <sec> found (issues can span pages)
        self.parser.current_section = None
        self.parser.current_sub_sec = None
        self.parser.current_sub_sub_sec = None
        # Don't reset current_issue - it carries over pages
        # Don't reset current_year - it carries over
        # Don't reset stamps_seen - we need to track across pages

        all_nodes = []

        for i, element in enumerate(elements):
            context_before, context_after = self.get_context(elements, i)
            nodes = self.parser.parse_element(element, context_before, context_after, page_number)
            all_nodes.extend(nodes)

        return all_nodes

    def build_relationships(self, nodes: List["PhilatelicNode"]) -> List[Dict]:
        """Build relationships between nodes.

        A relationship is emitted for every node whose ``base_stamp`` is the
        catalog number of some node in ``nodes``.

        Returns:
            One dict per relationship with the keys ``from``, ``to``, ``type``
            (``<NODE_TYPE>_OF``), ``sub_type`` and ``description``.
        """
        relationships = []

        # Index nodes by catalog number; for repeated numbers the last node wins.
        by_catalog = {n.catalog_number: n for n in nodes}

        for node in nodes:
            if node.base_stamp and node.base_stamp in by_catalog:
                base = by_catalog[node.base_stamp]
                relationships.append({
                    'from': node.catalog_number,
                    'to': base.catalog_number,
                    'type': f'{node.node_type.upper()}_OF',
                    'sub_type': node.sub_type,
                    'description': f"{node.catalog_number} ({node.sub_type}) is a {node.node_type} of stamp {base.catalog_number}"
                })

        return relationships

    def process_sample_pages(self, input_path: str, start_page: int = 30, num_pages: int = 2) -> Dict:
        """Process sample pages.

        Args:
            input_path: Path of the recognition JSON file.
            start_page: First page number to process.
            num_pages: Number of consecutive page numbers to process.

        Returns:
            Dict with ``total_nodes``, ``nodes_by_type``, ``all_nodes``,
            ``relationships`` and ``relationship_count``.
        """
        pages = self._load_pages(input_path) or []

        pages = [p for p in pages
                 if start_page <= p.get('page_number', 0) < start_page + num_pages]

        # Start from a clean parser so repeated calls on the same extractor do
        # not drop stamps that an earlier run left in stamps_seen.
        self._reset_parser_state()

        all_nodes = []
        for page in pages:
            all_nodes.extend(self.process_page(page))

        # Build relationships
        relationships = self.build_relationships(all_nodes)

        result = {
            'total_nodes': len(all_nodes),
            'nodes_by_type': {},
            'all_nodes': [node.to_dict() for node in all_nodes],
            'relationships': relationships,
            'relationship_count': len(relationships)
        }

        for node in all_nodes:
            result['nodes_by_type'].setdefault(node.node_type, []).append(node.to_dict())

        return result

    def inspect_json_structure(self, input_path: str, max_pages: int = 3):
        """Inspect JSON.

        Prints the page count and, for the first ``max_pages`` pages, the
        number of elements, a count per element label and the texts of the
        ``sec`` elements (issue names).
        """
        pages = self._load_pages(input_path)
        if pages is None:
            print("Unexpected JSON structure")
            return

        print(f"Total pages: {len(pages)}")
        print(f"\nInspecting first {min(max_pages, len(pages))} pages:\n")

        for page in pages[:max_pages]:
            print(f"Page {page.get('page_number')}:")
            elements = page.get('elements', [])
            print(f"  Elements: {len(elements)}")

            types = {}
            for el in elements:
                label = el.get('label', 'unknown')
                types[label] = types.get(label, 0) + 1

            print(f"  Types: {types}")

            # Show <sec> elements (issue names)
            sec_elements = [el for el in elements if el.get('label') == 'sec']
            if sec_elements:
                print(f"  Issues found:")
                for sec in sec_elements:
                    print(f"    - {sec.get('text', 'N/A')}")
            print()

    def process_file(self, input_path: str, output_path: str):
        """Process entire file.

        Args:
            input_path: Path of the recognition JSON file.
            output_path: Path the result is written to as UTF-8 JSON.

        Returns:
            The dict that was written: ``total_nodes``, ``pages_processed``,
            ``nodes``, ``relationships`` and a ``summary`` with node counts by
            type, page, issue and denomination plus the relationship count.
        """
        pages = self._load_pages(input_path) or []

        # Reset parser state at start
        self._reset_parser_state()

        all_nodes = []
        for page in pages:
            all_nodes.extend(self.process_page(page))

        # Build relationships
        relationships = self.build_relationships(all_nodes)

        summary = {
            'by_type': {},
            'by_page': {},
            'by_issue': {},
            'by_denomination': {},
            'relationship_count': len(relationships)
        }
        output = {
            'total_nodes': len(all_nodes),
            'pages_processed': len(pages),
            'nodes': [node.to_dict() for node in all_nodes],
            'relationships': relationships,
            'summary': summary
        }

        def bump(counts: Dict, key) -> None:
            counts[key] = counts.get(key, 0) + 1

        # Generate statistics
        for node in all_nodes:
            bump(summary['by_type'], node.node_type)
            # Page numbers are stored as strings so the keys survive JSON as-is.
            bump(summary['by_page'], str(node.page_number))
            if node.issue_name:
                bump(summary['by_issue'], node.issue_name)
            if node.denomination:
                bump(summary['by_denomination'], node.denomination)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        return output

In [None]:
def test_denomination_extraction():
    """Test comprehensive denomination extraction"""
    
    parser = CostaRicaCatalogParser()
    
    test_cases = [
        # Centavos/céntimos
        {'text': '99 15c black', 'expected': '15c', 'note': 'Standard centavos'},
        {'text': 'DP100 5c violet', 'expected': '5c', 'note': 'Small value'},
        {'text': 'PP101 100c orange', 'expected': '100c', 'note': '100 centavos (not Colones)'},
        {'text': 'S102 15centimos black', 'expected': '15c', 'note': 'Full word "centimos"'},
        
        # Colones
        {'text': '200 1col blue', 'expected': '1col', 'note': 'One colón'},
        {'text': '201 5C green', 'expected': '5col', 'note': 'Capital C = Colones'},
        {'text': '202 2colones red', 'expected': '2col', 'note': 'Full word "colones"'},
        
        # Pesos
        {'text': '50 1p black', 'expected': '1p', 'note': '1 peso'},
        {'text': '51 10pesos violet', 'expected': '10p', 'note': 'Full word "pesos"'},
        
        # Reales with fractions
        {'text': '10 ½r black', 'expected': '0.5r', 'note': 'Half real'},
        {'text': '11 1/2r violet', 'expected': '0.5r', 'note': 'Fraction 1/2 real'},
        {'text': '12 2r green', 'expected': '2r', 'note': '2 reales'},
        {'text': '13 ¼r orange', 'expected': '0.25r', 'note': 'Quarter real'},
        
        # Edge cases
        {'text': '99 15c deep violet 1,000,000', 'expected': '15c', 'note': 'Implicit "c", quantity present'},
        {'text': 'DP99 15c black #34009', 'expected': '15c', 'note': 'With reference number'},
    ]
    
    print("="*80)
    print("TEST 1: DENOMINATION EXTRACTION")
    print("="*80)
    print()
    
    passed = 0
    failed = 0
    
    for i, test in enumerate(test_cases, 1):
        print(f"{i}. {test['text']}")
        print(f"   Note: {test['note']}")
        
        denoms_raw = parser.extract_denominations(test['text'])
        
        if denoms_raw:
            denom_normalized = parser.normalize_denomination(denoms_raw[0][0], denoms_raw[0][1])
            
            if denom_normalized == test['expected']:
                print(f"   ✅ PASS - Extracted: {denom_normalized}")
                passed += 1
            else:
                print(f"   ❌ FAIL - Expected: {test['expected']}, Got: {denom_normalized}")
                failed += 1
        else:
            print(f"   ❌ FAIL - No denomination extracted")
            failed += 1
        print()
    
    print(f"Results: {passed}/{len(test_cases)} passed\n")
    return passed, failed

def test_issue_name_tracking():
    """Test that issue names from <sec> are properly tracked"""
    
    print("="*80)
    print("TEST 2: ISSUE NAME TRACKING (from <sec> elements)")
    print("="*80)
    print()
    
    # Simulate page with multiple issues
    test_page = {
        'page_number': 30,
        'elements': [
            {'label': 'sec', 'text': 'Simon Bolivar Birthday issue', 'reading_order': 1},
            {'label': 'para', 'text': 'July 24, 1921. Decree #18', 'reading_order': 2},
            {'label': 'para', 'text': 'Proof', 'reading_order': 3},
            {'label': 'para', 'text': 'DP99 15c black', 'reading_order': 4},
            {'label': 'para', 'text': 'Regular issue', 'reading_order': 5},
            {'label': 'para', 'text': '99 15c deep violet', 'reading_order': 6},
            
            {'label': 'sec', 'text': 'Central America Independence issue', 'reading_order': 7},
            {'label': 'para', 'text': 'September 15, 1921', 'reading_order': 8},
            {'label': 'para', 'text': 'PP100 5c violet', 'reading_order': 9},
            {'label': 'para', 'text': '100 5c violet', 'reading_order': 10},
        ]
    }
    
    extractor = PhilatelicExtractor()
    nodes = extractor.process_page(test_page)
    
    print(f"Extracted {len(nodes)} nodes\n")
    
    expected_issues = {
        'DP99': 'Simon Bolivar Birthday issue',
        '99': 'Simon Bolivar Birthday issue',
        'PP100': 'Central America Independence issue',
        '100': 'Central America Independence issue'
    }
    
    passed = 0
    failed = 0
    
    for cat_num, expected_issue in expected_issues.items():
        matching = [n for n in nodes if n.catalog_number == cat_num]
        
        if matching:
            node = matching[0]
            if node.issue_name == expected_issue:
                print(f"✅ {cat_num}: '{node.issue_name}'")
                passed += 1
            else:
                print(f"❌ {cat_num}: Expected '{expected_issue}', got '{node.issue_name}'")
                failed += 1
        else:
            print(f"❌ {cat_num}: Node not found")
            failed += 1
    
    print(f"\nResults: {passed}/{len(expected_issues)} passed\n")
    return passed, failed

def test_quantity_vs_denomination():
    """Test that quantities are not confused with denominations"""
    
    print("="*80)
    print("TEST 3: QUANTITY vs DENOMINATION DISAMBIGUATION")
    print("="*80)
    print()
    
    test_cases = [
        {
            'text': '99 15c deep violet 1,000,000',
            'catalog': '99',
            'expected_denom': '15c',
            'expected_qty': '1,000,000',
            'note': 'Quantity has comma'
        },
        {
            'text': 'DP99 15c black #34009',
            'catalog': 'DP99',
            'expected_denom': '15c',
            'expected_qty': None,
            'note': 'No quantity'
        },
        {
            'text': '100 5c violet printed 500,000',
            'catalog': '100',
            'expected_denom': '5c',
            'expected_qty': '500,000',
            'note': 'Quantity with "printed" keyword'
        },
    ]
    
    parser = CostaRicaCatalogParser()
    passed = 0
    failed = 0
    
    for i, test in enumerate(test_cases, 1):
        print(f"{i}. {test['text']}")
        print(f"   Note: {test['note']}")
        
        # Extract denomination
        denoms_raw = parser.extract_denominations(test['text'])
        denom = parser.normalize_denomination(denoms_raw[0][0], denoms_raw[0][1]) if denoms_raw else None
        
        # Extract quantity
        qty_match = parser.QUANTITY.search(test['text'])
        qty = qty_match.group(1) or qty_match.group(2) if qty_match else None
        
        denom_ok = (denom == test['expected_denom'])
        qty_ok = (qty == test['expected_qty'])
        
        if denom_ok and qty_ok:
            print(f"   ✅ PASS - Denom: {denom}, Qty: {qty}")
            passed += 1
        else:
            print(f"   ❌ FAIL")
            if not denom_ok:
                print(f"      Denom: Expected {test['expected_denom']}, Got {denom}")
            if not qty_ok:
                print(f"      Qty: Expected {test['expected_qty']}, Got {qty}")
            failed += 1
        print()
    
    print(f"Results: {passed}/{len(test_cases)} passed\n")
    return passed, failed

def test_color_extraction_expert():
    """Expert-level color extraction test"""
    
    print("="*80)
    print("TEST 4: EXPERT COLOR EXTRACTION")
    print("="*80)
    print()
    
    parser = CostaRicaCatalogParser()
    
    test_cases = [
        {
            'text': 'S99 15c violet, op "specimen" in red with hole',
            'catalog': 'S99',
            'expected_color': 'violet',
            'should_have_attr': 'overprint_red',
            'note': 'Red is overprint color, not stamp color'
        },
        {
            'text': '106 1c brown, blue op inverted op',
            'catalog': '106',
            'expected_color': 'brown',
            'should_have_attr': 'overprint_blue',
            'note': 'Blue is overprint, brown is stamp'
        },
        {
            'text': 'PP99 15centimos light gray violet, imperf',
            'catalog': 'PP99',
            'expected_color': 'light gray violet',
            'note': 'Compound color with full word denomination'
        },
        {
            'text': '50 1p black on green',
            'catalog': '50',
            'expected_color': 'black',
            'note': 'Bicolor: black on green paper'
        },
    ]
    
    passed = 0
    failed = 0
    
    for i, test in enumerate(test_cases, 1):
        print(f"{i}. {test['text']}")
        print(f"   Note: {test['note']}")
        
        color = parser.extract_stamp_color(test['text'], test['catalog'])
        attrs = parser.extract_attributes(test['text'])
        
        color_ok = (color == test['expected_color'])
        attr_ok = True
        
        if 'should_have_attr' in test:
            attr_ok = test['should_have_attr'] in attrs
        
        if color_ok and attr_ok:
            print(f"   ✅ PASS - Color: {color}")
            if 'should_have_attr' in test:
                print(f"      Attribute found: {test['should_have_attr']}")
            passed += 1
        else:
            print(f"   ❌ FAIL")
            if not color_ok:
                print(f"      Expected color: {test['expected_color']}, Got: {color}")
            if not attr_ok:
                print(f"      Missing attribute: {test['should_have_attr']}")
            failed += 1
        print()
    
    print(f"Results: {passed}/{len(test_cases)} passed\n")
    return passed, failed

def test_full_integration():
    """Run the extractor end-to-end on a couple of catalog pages and validate the result.

    Loads the recognition JSON from ``input_path`` (a dict with a ``'pages'`` key or a
    plain list of pages). If the file is missing, a three-element sample page is used
    instead. Every selected page is passed to ``PhilatelicExtractor.process_page``;
    the resulting nodes are grouped by issue and by type, nine validation checks are
    evaluated, and a detailed report is printed.

    Returns:
        tuple: ``(passed, failed)`` - number of validation checks that passed and failed.
    """
    
    print("="*80)
    print("TEST 5: FULL INTEGRATION (Real Pages 30-31 from file)")
    print("="*80)
    print()
    
    
    input_path = "./results/recognition_json/Mena 2018 CRPC .json"
    
    if not os.path.exists(input_path):
        print(f"⚠️  File not found: {input_path}")
        print("   Using minimal sample data instead\n")
        
        # Fallback to minimal sample
        sample_data = [{
            'page_number': 30,
            'elements': [
                {"label": "sec", "text": "Simon Bolivar Birthday issue", "reading_order": 8},
                {"label": "para", "text": "DP99 15c black", "reading_order": 11},
                {"label": "para", "text": "99 15c deep violet", "reading_order": 17}
            ]
        }]
    else:
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # FIX: the pages may be wrapped in a dict under the 'pages' key
        if isinstance(data, dict) and 'pages' in data:
            all_pages = data['pages']
        elif isinstance(data, list):
            all_pages = data
        else:
            print("⚠️  Unexpected JSON structure")
            all_pages = []
        
        # NOTE(review): this keeps page_number 10 and 11, while the banner, the
        # fallback sample and the expected node counts all refer to pages 30-31.
        # Confirm which page numbers are intended.
        sample_data = [p for p in all_pages if p.get('page_number') in [10, 11]]
        print(f"Loaded {len(sample_data)} pages from file\n")
        print(f"Total pages in catalog: {len(all_pages)}\n")
    
    extractor = PhilatelicExtractor()
    
    # Collect the nodes of every selected page into one flat list
    all_nodes = []
    for page in sample_data:
        nodes = extractor.process_page(page)
        all_nodes.extend(nodes)
    
    print(f"Total nodes extracted: {len(all_nodes)}")
    print(f"Expected for pages 30-31: 58-70 nodes\n")
    
    # Group by issue (nodes without an issue name go under 'No issue')
    by_issue = {}
    for node in all_nodes:
        issue = node.issue_name or 'No issue'
        if issue not in by_issue:
            by_issue[issue] = []
        by_issue[issue].append(node)
    
    print("Nodes by Issue:")
    for issue, nodes in by_issue.items():
        print(f"  {issue}: {len(nodes)} nodes")
    print()
    
    # Group by type
    by_type = {}
    for node in all_nodes:
        if node.node_type not in by_type:
            by_type[node.node_type] = []
        by_type[node.node_type].append(node)
    
    print("Nodes by Type:")
    for node_type, nodes in sorted(by_type.items()):
        print(f"  {node_type:15s}: {len(nodes)} nodes")
    print()
    
    # Validation checks - GENERALIZED
    # Each check is a dict with 'name', 'passed' (bool) and 'detail' (report text).
    checks = []
    
    # Check 1: All nodes have issue_name
    nodes_with_issue = [n for n in all_nodes if n.issue_name]
    checks.append({
        'name': 'All nodes have issue_name',
        'passed': len(nodes_with_issue) == len(all_nodes),
        'detail': f"{len(nodes_with_issue)}/{len(all_nodes)}"
    })
    
    # Check 2: All nodes have denomination
    nodes_with_denom = [n for n in all_nodes if n.denomination]
    checks.append({
        'name': 'All nodes have denomination',
        'passed': len(nodes_with_denom) == len(all_nodes),
        'detail': f"{len(nodes_with_denom)}/{len(all_nodes)}"
    })
    
    # Check 3: Multiple issues detected (GENERALIZED)
    # Despite its name, this check already passes when a single issue is found (>= 1).
    unique_issues = set(n.issue_name for n in all_nodes if n.issue_name)
    checks.append({
        'name': 'Multiple issues detected',
        'passed': len(unique_issues) >= 1,
        'detail': f"{len(unique_issues)} unique issues found"
    })
    
    # Check 4: Overprint colors captured as attributes (GENERALIZED)
    nodes_with_op_attrs = [n for n in all_nodes 
                          if any('overprint' in attr.lower() for attr in n.attributes)]
    checks.append({
        'name': 'Overprint colors in attributes',
        'passed': True,  # Non-critical, just informational
        'detail': f"{len(nodes_with_op_attrs)} nodes with overprint attributes"
    })
    
    # Check 5: Base stamps exist
    base_stamps = [n for n in all_nodes if n.node_type == 'stamp' and not n.base_stamp]
    checks.append({
        'name': 'Base stamps detected',
        'passed': len(base_stamps) > 0,
        'detail': f"{len(base_stamps)} base stamps"
    })
    
    # Check 6: Varieties/Proofs have base_stamp reference
    # Passes as soon as at least one derived node has a base reference
    # (or trivially when there are no derived nodes at all).
    derived_nodes = [n for n in all_nodes if n.node_type in ['proof', 'variety', 'specimen']]
    with_base = [n for n in derived_nodes if n.base_stamp]
    checks.append({
        'name': 'Derived nodes have base_stamp',
        'passed': len(with_base) > 0 if derived_nodes else True,
        'detail': f"{len(with_base)}/{len(derived_nodes)} have base reference" if derived_nodes else "N/A"
    })
    
    # Check 7: Relationships exist
    relationships = extractor.build_relationships(all_nodes)
    checks.append({
        'name': 'Relationships built',
        'passed': len(relationships) > 0,
        'detail': f"{len(relationships)} relationships"
    })
    
    # Check 8: Year captured
    nodes_with_year = [n for n in all_nodes if n.year]
    checks.append({
        'name': 'Year information captured',
        'passed': len(nodes_with_year) > 0,
        'detail': f"{len(nodes_with_year)} nodes with year info"
    })
    
    # Check 9: Node count reasonable
    # The accepted range (50-80) is wider than the 58-70 quoted in the report text.
    reasonable_count = 50 <= len(all_nodes) <= 80
    checks.append({
        'name': 'Node count within expected range',
        'passed': reasonable_count,
        'detail': f"{len(all_nodes)} nodes (expected 58-70)"
    })
    
    # Print validation results
    print("="*80)
    print("VALIDATION RESULTS")
    print("="*80)
    print()
    
    passed = sum(1 for c in checks if c['passed'])
    
    for check in checks:
        status = '✅' if check['passed'] else '❌'
        print(f"{status} {check['name']}: {check['detail']}")
    
    print(f"\nValidation: {passed}/{len(checks)} checks passed\n")
    
    # Show detailed breakdown
    print("="*80)
    print("DETAILED BREAKDOWN BY ISSUE")
    print("="*80)
    print()
    
    for issue_name, nodes in sorted(by_issue.items()):
        print(f"\n{issue_name}:")
        print(f"  Total: {len(nodes)} nodes")
        
        # Count by type within this issue
        types_in_issue = {}
        for n in nodes:
            types_in_issue[n.node_type] = types_in_issue.get(n.node_type, 0) + 1
        
        print(f"  Types: {dict(types_in_issue)}\n")
        
        # List every node of the issue (the former 5-node cap is disabled)
        for i, node in enumerate(nodes, 1): #enumerate(nodes[:5], 1)
            attrs_str = f", attrs: {node.attributes if node.attributes else '[]'}"
            refs_str = f", refs: {node.reference_numbers[:2]}" if node.reference_numbers else ""
            base_str = f" → {node.base_stamp}" if node.base_stamp else ""
            
            print(f"  {i}. {node.catalog_number} ({node.node_type}/{node.sub_type}){base_str}")
            qty_str = f", qty: {node.quantity}" if node.quantity else ""
            print(f"     {node.denomination} {node.color or 'N/A'}{qty_str}{attrs_str}{refs_str}")
        
        # if len(nodes) > 5:
        #     print(f"  ... and {len(nodes) - 5} more")
    
    # Show relationships
    if relationships:
        print("\n" + "="*80)
        print("RELATIONSHIPS")
        print("="*80)
        print()
        
        # Group relationships by type
        by_rel_type = {}
        for rel in relationships:
            rel_type = rel['type']
            if rel_type not in by_rel_type:
                by_rel_type[rel_type] = []
            by_rel_type[rel_type].append(rel)
        
        # Only the first five relationships of each type are listed
        for rel_type, rels in sorted(by_rel_type.items()):
            print(f"\n{rel_type} ({len(rels)} relationships):")
            for rel in rels[:5]:
                print(f"  {rel['from']:10s} --> {rel['to']}")
            if len(rels) > 5:
                print(f"  ... and {len(rels) - 5} more")
    
    print("\n" + "="*80)
    print(f"Results: {passed}/{len(checks)} checks passed")
    print("="*80 + "\n")
    
    # DEBUG: list the nodes that have no denomination
    nodes_without_denom = [n for n in all_nodes if not n.denomination]
    if nodes_without_denom:
        print("\n" + "="*80)
        print("DEBUG: NODES WITHOUT DENOMINATION")
        print("="*80)
        print()
        print(f"Found {len(nodes_without_denom)} nodes without denomination:\n")
        
        for i, node in enumerate(nodes_without_denom, 1):
            print(f"{i}. Catalog: {node.catalog_number}")
            print(f"   Type: {node.node_type}/{node.sub_type}")
            print(f"   Issue: {node.issue_name}")
            print(f"   Page: {node.page_number}, Order: {node.reading_order}")
            print(f"   Raw text: {node.raw_text[:100]}...")
            print(f"   Color: {node.color}")
            print()
        
    # (passed, failed) so the caller can aggregate it with the other test results
    return passed, len(checks) - passed

In [None]:
import os
import json
import matplotlib.pyplot as plt
import networkx as nx
from matplotlib.patches import FancyBboxPatch
from collections import defaultdict
import pandas as pd

def create_stamp_graph(base_stamp_node, all_nodes, relationships, output_dir="./graph_outputs"):
    """
    Create a detailed graph for a single stamp and all its relationships.

    The base stamp is drawn at the top centre and every directly related node is
    placed on a semicircle below it. The figure is saved as
    ``stamp_<catalog number>.png`` inside ``output_dir``.

    Args:
        base_stamp_node: The main stamp node
        all_nodes: List of all nodes (used to look up the related nodes' details)
        relationships: List of all relationships; each is a dict with
            'from', 'to' and 'type' keys
        output_dir: Directory to save graph images (created if missing)

    Returns:
        tuple: (output_path, G) - path of the saved PNG and the networkx DiGraph
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    G = nx.DiGraph()
    base_id = base_stamp_node.catalog_number

    # Build the base label from the parts that are actually present, so a missing
    # denomination or color does not show up as "None" or as an empty line.
    base_label_parts = [base_id]
    if base_stamp_node.denomination:
        base_label_parts.append(base_stamp_node.denomination)
    if base_stamp_node.color:
        base_label_parts.append(base_stamp_node.color)
    if base_stamp_node.quantity:
        base_label_parts.append(f"Qty: {base_stamp_node.quantity}")

    G.add_node(base_id,
               node_obj=base_stamp_node,
               node_type=base_stamp_node.node_type,
               label='\n'.join(base_label_parts))

    # Keep only the relationships that touch this stamp (in either direction)
    related_relationships = [
        rel for rel in relationships
        if rel['from'] == base_id or rel['to'] == base_id
    ]
    related_nodes = set()
    for rel in related_relationships:
        related_nodes.update((rel['from'], rel['to']))

    # Add related nodes to graph
    node_lookup = {n.catalog_number: n for n in all_nodes}

    for node_id in related_nodes:
        if node_id == base_id or node_id not in node_lookup:
            continue
        node = node_lookup[node_id]
        label_parts = [node_id]
        if node.denomination:
            label_parts.append(node.denomination)
        if node.color:
            label_parts.append(node.color)
        if node.node_type != 'stamp':
            label_parts.append(f"({node.node_type})")

        G.add_node(node_id,
                   node_obj=node,
                   node_type=node.node_type,
                   label='\n'.join(label_parts))

    # Add edges with relationship types. Endpoints missing from node_lookup are
    # created implicitly here, without a label or node_type attribute.
    for rel in related_relationships:
        G.add_edge(rel['from'], rel['to'],
                   rel_type=rel['type'],
                   label=rel['type'].replace('_', ' ').title())

    fig, ax = plt.subplots(1, 1, figsize=(14, 10))

    # Define node colors by type
    node_colors = {
        'stamp': '#4CAF50',        # Green for base stamps
        'variety': '#2196F3',       # Blue for varieties
        'proof': '#FF9800',         # Orange for proofs
        'specimen': '#9C27B0',      # Purple for specimens
        'error': '#F44336',         # Red for errors
        'overprint': '#00BCD4',     # Cyan for overprints
        'surcharge': '#FFEB3B',     # Yellow for surcharges
    }

    # Nodes without a node_type are colored like stamps; unknown types are gray
    colors = [
        node_colors.get(G.nodes[node_id].get('node_type', 'stamp'), '#9E9E9E')
        for node_id in G.nodes()
    ]

    # Deterministic layout: every position is assigned explicitly, so no
    # force-directed layout needs to be computed first.
    related = [n for n in G.nodes() if n != base_id]
    if related:
        pos = {base_id: (0.5, 0.9)}
        radius = 0.4
        angle_step = np.pi / (len(related) + 1)
        for i, node_id in enumerate(related):
            # Offsetting by pi puts the nodes on the lower half of the circle
            angle = angle_step * (i + 1) + np.pi
            pos[node_id] = (0.5 + radius * np.cos(angle),
                            0.4 + radius * np.sin(angle) * 0.6)
    else:
        pos = {base_id: (0.5, 0.5)}

    # Draw the graph
    nx.draw_networkx_nodes(G, pos,
                           node_color=colors,
                           node_size=3000,
                           alpha=0.9,
                           ax=ax)

    labels = nx.get_node_attributes(G, 'label')
    nx.draw_networkx_labels(G, pos, labels,
                            font_size=8,
                            font_weight='bold',
                            ax=ax)

    nx.draw_networkx_edges(G, pos,
                           edge_color='gray',
                           arrows=True,
                           arrowsize=20,
                           arrowstyle='-|>',
                           width=2,
                           alpha=0.6,
                           ax=ax)

    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels,
                                 font_size=7,
                                 font_color='red',
                                 ax=ax)

    # Add title with stamp details
    title_parts = [f"Stamp Network: {base_id}"]
    if base_stamp_node.issue_name:
        title_parts.append(f"Issue: {base_stamp_node.issue_name}")
    if base_stamp_node.year:
        title_parts.append(f"Year: {base_stamp_node.year}")

    ax.set_title('\n'.join(title_parts), fontsize=14, fontweight='bold')

    # Legend: one entry per node type that is actually present in this graph
    present_types = {G.nodes[n].get('node_type') for n in G.nodes()}
    legend_elements = [
        ax.scatter([], [], c=color, s=100, label=node_type.title())
        for node_type, color in node_colors.items()
        if node_type in present_types
    ]

    if legend_elements:
        ax.legend(handles=legend_elements, loc='upper left', frameon=True)

    # Add statistics box
    stats_text = f"Nodes: {len(G.nodes())}\nRelationships: {len(G.edges())}"
    ax.text(0.02, 0.02, stats_text, transform=ax.transAxes,
            fontsize=10, verticalalignment='bottom',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    ax.axis('off')
    fig.tight_layout()

    # Save the figure
    safe_filename = base_id.replace('/', '_').replace(' ', '_')
    output_path = os.path.join(output_dir, f"stamp_{safe_filename}.png")
    fig.savefig(output_path, dpi=150, bbox_inches='tight')
    # Close this specific figure; the function is called once per stamp in a loop
    plt.close(fig)

    return output_path, G


def _comparison_row(node, relationship_label):
    """Build one table row (column name -> cell text) for a node."""
    return {
        'Catalog #': node.catalog_number,
        'Type': node.node_type,
        'Denomination': node.denomination or 'N/A',
        'Color': node.color or 'N/A',
        'Quantity': node.quantity or 'N/A',
        'Year': node.year or 'N/A',
        'Attributes': ', '.join(node.attributes) if node.attributes else 'N/A',
        'References': ', '.join(node.reference_numbers) if node.reference_numbers else 'N/A',
        'Relationship': relationship_label,
    }


def create_comparative_table(base_stamp_node, all_nodes, relationships, output_dir="./graph_outputs"):
    """
    Create a comparative table showing the stamp and all its variants/related items.

    The first row is the base stamp; one further row is added for every
    relationship that starts at the base stamp ('from' == its catalog number) and
    whose target is present in ``all_nodes``. The table is rendered with
    matplotlib and saved as ``table_<catalog number>.png`` inside ``output_dir``.

    Args:
        base_stamp_node: The main stamp node
        all_nodes: List of all nodes
        relationships: List of relationship dicts with 'from', 'to' and 'type' keys
        output_dir: Directory to save the table image (created if missing)

    Returns:
        str: Path of the saved PNG
    """
    os.makedirs(output_dir, exist_ok=True)

    base_id = base_stamp_node.catalog_number
    node_lookup = {n.catalog_number: n for n in all_nodes}

    # Every row goes through the same builder, so all rows carry the same nine
    # columns. Previously related rows lacked 'Quantity' and rendered as "nan".
    related_data = [_comparison_row(base_stamp_node, 'BASE STAMP')]
    # NOTE(review): only outgoing relationships are listed here, whereas
    # create_stamp_graph includes both directions - confirm this is intended.
    for rel in relationships:
        if rel['from'] == base_id and rel['to'] in node_lookup:
            related_data.append(
                _comparison_row(node_lookup[rel['to']],
                                rel['type'].replace('_', ' ').upper())
            )

    df = pd.DataFrame(related_data)

    # Figure height grows with the number of rows
    fig, ax = plt.subplots(figsize=(14, max(4, len(related_data) * 0.5)))
    ax.axis('tight')
    ax.axis('off')

    # colWidths has one entry per column of the row dict (nine in total)
    table = ax.table(cellText=df.values,
                     colLabels=df.columns,
                     cellLoc='left',
                     loc='center',
                     colWidths=[0.10, 0.09, 0.11, 0.10, 0.09, 0.08, 0.17, 0.13, 0.10])

    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.5)

    # Row 0 is the header, row 1 is the base stamp
    for col in range(len(df.columns)):
        header_cell = table[(0, col)]
        header_cell.set_facecolor('#4CAF50')
        header_cell.set_text_props(weight='bold', color='white')
        table[(1, col)].set_facecolor('#E8F5E9')

    # Add title
    title = f"Stamp Catalog Comparison: {base_id}"
    if base_stamp_node.issue_name:
        title += f"\nIssue: {base_stamp_node.issue_name}"
    ax.set_title(title, fontsize=12, fontweight='bold', pad=20)

    # Save
    safe_filename = base_id.replace('/', '_').replace(' ', '_')
    output_path = os.path.join(output_dir, f"table_{safe_filename}.png")
    fig.savefig(output_path, dpi=150, bbox_inches='tight')
    # Close this specific figure; the function is called once per stamp in a loop
    plt.close(fig)

    return output_path


def enhanced_test_full_integration():
    """Run the extractor on two catalog pages and render one graph and table per base stamp.

    Reads the recognition JSON, keeps the pages whose ``page_number`` is 8 or 9,
    extracts nodes and relationships with ``PhilatelicExtractor`` and, for every
    base stamp (a 'stamp' node without ``base_stamp``), writes a relationship graph
    and a comparison table. Finally an HTML summary linking all images is created.

    Returns None; it stops early when the input file is missing or the JSON has an
    unexpected top-level structure.
    """
    separator = "=" * 80

    print(separator)
    print("ENHANCED TEST WITH GRAPH VISUALIZATION")
    print(separator)
    print()

    input_path = "./results/recognition_json/Mena 2018 CRPC .json"

    # Guard: nothing to do without the recognition file
    if not os.path.exists(input_path):
        print(f"⚠️  File not found: {input_path}")
        return

    with open(input_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # The pages are either wrapped in a dict under 'pages' or given as a plain list
    if isinstance(data, dict) and 'pages' in data:
        all_pages = data['pages']
    elif isinstance(data, list):
        all_pages = data
    else:
        print("⚠️  Unexpected JSON structure")
        return

    # Keep only the two pages under inspection
    wanted_pages = [8, 9]
    sample_data = [page for page in all_pages if page.get('page_number') in wanted_pages]

    # Extract the nodes page by page, then derive the relationships between them
    extractor = PhilatelicExtractor()
    all_nodes = [node for page in sample_data for node in extractor.process_page(page)]
    relationships = extractor.build_relationships(all_nodes)

    # Base stamps are 'stamp' nodes that do not point to another stamp
    base_stamps = [
        node for node in all_nodes
        if node.node_type == 'stamp' and not node.base_stamp
    ]
    stamp_count = len(base_stamps)

    print(f"Found {stamp_count} base stamps to visualize")
    print("Creating individual graphs for each stamp...\n")

    graph_paths = []
    table_paths = []

    for position, stamp in enumerate(base_stamps, start=1):
        print(f"Processing {position}/{stamp_count}: {stamp.catalog_number} - {stamp.issue_name}")

        # Network graph for this stamp
        graph_path, stamp_graph = create_stamp_graph(stamp, all_nodes, relationships)
        graph_paths.append(graph_path)
        print(f"  ✅ Graph saved: {graph_path}")

        # Comparison table for this stamp
        table_path = create_comparative_table(stamp, all_nodes, relationships)
        table_paths.append(table_path)
        print(f"  ✅ Table saved: {table_path}")

        print(f"  📊 Graph stats: {len(stamp_graph.nodes())} nodes, {len(stamp_graph.edges())} relationships")
        print()

    # One HTML page that links every generated image
    create_html_summary(base_stamps, graph_paths, table_paths)

    print(separator)
    print("VISUALIZATION COMPLETE")
    print(f"Generated {len(graph_paths)} stamp graphs")
    print(f"Generated {len(table_paths)} comparison tables")
    print("Check ./graph_outputs/ directory for results")
    print("Open summary.html for easy navigation")
    print(separator)


def create_html_summary(base_stamps, graph_paths, table_paths, output_dir="./graph_outputs"):
    """Create an HTML file to easily view all generated graphs.

    Writes ``summary.html`` into ``output_dir`` (created if missing). The page has a
    navigation list plus one section per stamp showing its graph and table images,
    which are referenced by file name only and so are expected to sit next to the
    HTML file.

    Args:
        base_stamps: Stamp nodes; ``catalog_number``, ``issue_name``,
            ``denomination``, ``color`` and ``year`` are read from each.
        graph_paths: Graph image paths, index-aligned with ``base_stamps``.
        table_paths: Table image paths, index-aligned with ``base_stamps``.
        output_dir: Directory where ``summary.html`` is written.
    """
    from html import escape  # local import: keeps the block self-contained

    def _text(value):
        """Return the HTML-escaped value, or 'N/A' when it is empty."""
        return escape(str(value)) if value else 'N/A'

    # The directory may not exist yet when there are no stamps or when this
    # function is called on its own.
    os.makedirs(output_dir, exist_ok=True)

    parts = ["""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Philatelic Catalog Verification</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            h1 { color: #333; }
            .stamp-section { 
                border: 2px solid #ddd; 
                margin: 20px 0; 
                padding: 15px; 
                border-radius: 5px;
            }
            .stamp-header { 
                background: #f5f5f5; 
                padding: 10px; 
                margin: -15px -15px 15px -15px;
                border-radius: 3px 3px 0 0;
            }
            .images { display: flex; gap: 20px; }
            .image-container { flex: 1; text-align: center; }
            img { max-width: 100%; border: 1px solid #ccc; }
            .navigation { 
                position: fixed; 
                right: 20px; 
                top: 20px; 
                background: white; 
                border: 1px solid #ddd;
                padding: 10px;
                max-height: 80vh;
                overflow-y: auto;
            }
        </style>
    </head>
    <body>
        <h1>Philatelic Catalog Verification Graphs</h1>
        <div class="navigation">
            <h3>Quick Navigation</h3>
            <ul>
    """]

    # Add navigation links. Catalog text comes from parsed documents, so it is
    # escaped before being placed in markup.
    for stamp in base_stamps:
        safe_id = escape(stamp.catalog_number.replace('/', '_').replace(' ', '_'))
        parts.append(f'<li><a href="#{safe_id}">{escape(stamp.catalog_number)}</a></li>\n')

    parts.append("""
            </ul>
        </div>
        <div style="margin-right: 200px;">
    """)

    # Add stamp sections
    for i, stamp in enumerate(base_stamps):
        safe_id = escape(stamp.catalog_number.replace('/', '_').replace(' ', '_'))
        catalog_number = escape(stamp.catalog_number)
        graph_filename = escape(os.path.basename(graph_paths[i]))
        table_filename = escape(os.path.basename(table_paths[i]))

        parts.append(f"""
        <div class="stamp-section" id="{safe_id}">
            <div class="stamp-header">
                <h2>{catalog_number}</h2>
                <p><strong>Issue:</strong> {_text(stamp.issue_name)}</p>
                <p><strong>Denomination:</strong> {_text(stamp.denomination)}</p>
                <p><strong>Color:</strong> {_text(stamp.color)}</p>
                <p><strong>Year:</strong> {_text(stamp.year)}</p>
            </div>
            <div class="images">
                <div class="image-container">
                    <h3>Relationship Graph</h3>
                    <img src="{graph_filename}" alt="Graph for {catalog_number}">
                </div>
                <div class="image-container">
                    <h3>Comparison Table</h3>
                    <img src="{table_filename}" alt="Table for {catalog_number}">
                </div>
            </div>
        </div>
        """)

    parts.append("""
        </div>
    </body>
    </html>
    """)

    # Save HTML file
    html_path = os.path.join(output_dir, "summary.html")
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"✅ HTML summary created: {html_path}")


# Additional utility function for verification
def export_verification_csv(all_nodes, relationships, output_path="./graph_outputs/verification.csv"):
    """Export all data to CSV for manual verification against catalog.

    One row is written per node. ``Related_To`` lists the targets of the
    relationships that start at that node, formatted as ``"<to> (<type>)"`` and
    joined with ``"; "``.

    Args:
        all_nodes: Nodes to export.
        relationships: Relationship dicts with 'from', 'to' and 'type' keys.
        output_path: Destination CSV file; its parent directory is created if missing.

    Returns:
        pandas.DataFrame: The exported table.
    """
    columns = [
        'Catalog_Number', 'Type', 'SubType', 'Issue', 'Year', 'Denomination',
        'Color', 'Attributes', 'References', 'Base_Stamp', 'Related_To',
        'Page', 'Reading_Order',
    ]

    # Index the outgoing relationships once instead of rescanning the whole
    # relationship list for every node.
    outgoing = {}
    for rel in relationships:
        outgoing.setdefault(rel['from'], []).append(f"{rel['to']} ({rel['type']})")

    data = [
        {
            'Catalog_Number': node.catalog_number,
            'Type': node.node_type,
            'SubType': node.sub_type,
            'Issue': node.issue_name,
            'Year': node.year,
            'Denomination': node.denomination,
            'Color': node.color,
            'Attributes': ', '.join(node.attributes) if node.attributes else '',
            'References': ', '.join(node.reference_numbers) if node.reference_numbers else '',
            'Base_Stamp': node.base_stamp or '',
            'Related_To': '; '.join(outgoing.get(node.catalog_number, [])),
            'Page': node.page_number,
            'Reading_Order': node.reading_order,
        }
        for node in all_nodes
    ]

    # Explicit columns keep the header row even when there are no nodes
    df = pd.DataFrame(data, columns=columns)

    # Create the parent directory so the export also works before any graph was
    # generated (skipped for bare file names and file-like targets).
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"✅ Verification CSV exported: {output_path}")

    return df

In [None]:
"""Run all expert validation tests"""
    
# Banner
print("\n")
print(f"╔{'=' * 78}╗")
print(f"║{' ' * 15}EXPERT VALIDATION TEST SUITE - COSTA RICA{' ' * 22}║")
print(f"║{' ' * 22}Philately & Regex Expert Edition{' ' * 24}║")
print(f"╚{'=' * 78}╝")
print()

# Run the five test groups in order; each returns a (passed, failed) pair
p1, f1 = test_denomination_extraction()      # Test 1: Denominations
p2, f2 = test_issue_name_tracking()          # Test 2: Issue names
p3, f3 = test_quantity_vs_denomination()     # Test 3: Quantity vs Denomination
p4, f4 = test_color_extraction_expert()      # Test 4: Color extraction
p5, f5 = test_full_integration()             # Test 5: Full integration

results = [
    ('Denomination Extraction', p1, f1),
    ('Issue Name Tracking', p2, f2),
    ('Quantity vs Denomination', p3, f3),
    ('Expert Color Extraction', p4, f4),
    ('Full Integration', p5, f5),
]

# Final summary
rule = "=" * 80
print("\n" + rule)
print("FINAL SUMMARY")
print(rule)
print()

total_passed = sum(entry[1] for entry in results)
total_tests = sum(entry[1] + entry[2] for entry in results)

# One line per test group: status icon, name, passed/total and percentage
for name, passed, failed in results:
    total = passed + failed
    pct = passed / total * 100 if total > 0 else 0
    status = '⚠️ ' if failed else '✅'
    print(f"{status} {name:30s}: {passed:2d}/{total:2d} ({pct:5.1f}%)")

print(f"\n{'='*80}")
print(f"OVERALL: {total_passed}/{total_tests} tests passed ({total_passed/total_tests*100:.1f}%)")
print(f"{'='*80}\n")

if total_passed != total_tests:
    print(f"⚠️  {total_tests - total_passed} tests fallaron")
    print("Revisa los detalles arriba para corregir los problemas.")
else:
    print("🎉 ALL TESTS PASSED!")
    print("\n✅ El extractor está listo para procesar el catálogo completo de Costa Rica")
    print("✅ Maneja todas las denominaciones: centavos, colones, pesos, reales")
    print("✅ Extrae issue names correctamente de elementos <sec>")
    print("✅ Distingue cantidades de denominaciones")
    print("✅ Extrae colores sin confundirlos con overprints")
    print("\n🚀 Próximo paso: Ejecutar en páginas 30-31 completas")

In [None]:
# Generate the per-stamp graphs, the comparison tables and the HTML summary.
enhanced_test_full_integration()

## Get the Catalogues with Landing AI

In [None]:
import json
from landingai_ade import LandingAIADE
# Load environment variables 
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Folder and base name (without extension) of the catalogue PDF to parse
pdf_path = "./pdfs/Catalogues/"
pdf_file_name = "Scott CR 2024 18-34"

# Send the document to Landing AI ADE for parsing
response = LandingAIADE().parse(
    document_url=f"{pdf_path}{pdf_file_name}.pdf",
    model="dpt-2-latest",
)


In [None]:
# Print the results
print("Extracted Markdown:")
print(response.markdown)
print("Extracted Chunks:")
print(response.chunks)

# Make sure the destination folder exists before writing, so an already parsed
# response is not lost to a missing directory.
os.makedirs('results/parsed_catalogues', exist_ok=True)

# Save Markdown to a file
if response.markdown:
    with open(f'results/parsed_catalogues/{pdf_file_name}.md', 'w', encoding='utf-8') as f:
        f.write(response.markdown)
    print("\nMarkdown content saved to a Markdown file.")
else:
    print("No 'markdown' field found in the response")

# Save Chunks to a JSON file
if response.chunks:
    # Convert the chunk objects to plain dicts so they can be serialized as JSON
    chunks_data = [chunk.model_dump() for chunk in response.chunks]

    with open(f'results/parsed_catalogues/{pdf_file_name}_chunks.json', 'w', encoding='utf-8') as f:
        json.dump(chunks_data, f, ensure_ascii=False, indent=2)
    print(f"\n{len(chunks_data)} chunks saved to JSON file.")
else:
    print("No 'chunks' field found in the response")

In [None]:
def _clean_chunk_text(text: str) -> str:
    """Clean markdown and formatting artifacts"""
    # Remove anchor tags
    text = re.sub(r'<a id=[\'"][^\'"]+[\'"]></a>\n*', '', text)
    # Remove figure markup but keep content
    #text = re.sub(r'<::(.*?)::>', r'\1', text, flags=re.DOTALL)
    # Clean excessive whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

In [None]:
group_chunks = []

# Load the chunks; the file is only needed while reading
with open(f'results/parsed_catalogues/{pdf_file_name}_chunks.json', 'r', encoding='utf-8') as f:
    chunks_data = json.load(f)
print(f"Se cargaron {len(chunks_data)} chunks desde el archivo JSON.\n")

# Walk over the chunks, accumulating them until a 'text' chunk closes the group
temp_merge = []
for i, chunk in enumerate(chunks_data):
    # A chunk without grounding is reported as page 'N/A' instead of raising
    page = (chunk.get('grounding') or {}).get('page', 'N/A')

    print(f"--- Chunk {i + 1} ---")
    print(f"Tipo: {chunk.get('type', 'N/A')}")
    print(f"Texto: {chunk.get('markdown', 'N/A')[:100]}...")  # first 100 characters
    print(f"Página: {page}")

    print()  # blank line between chunks
    temp_merge.append(chunk)
    # A text chunk ends the current group
    if chunk.get('type') == 'text':
        group_chunks.append(temp_merge)
        temp_merge = []

# NOTE(review): chunks that follow the last 'text' chunk stay in temp_merge and are
# not added to group_chunks - confirm that dropping them is intended.

In [None]:
# from icecream import ic
# for i,group_chunk in enumerate(group_chunks):
#     if len(group_chunk) == 1:
#         print("Group Number ",i)
#         for chunk in group_chunk:
#             print("SOLUTION")
#             chunk_solution = group_chunks[i-1][-1]
#             print(f"Tipo: {chunk_solution.get('type', 'N/A')}")
#             print(f"Texto: {_clean_chunk_text(chunk_solution.get('markdown', 'N/A'))}")
#             print(f"Página: {chunk_solution.get('grounding', 'N/A')['page']}")
#             print("-------END SOLUTION----------")
#             print()                
#             print(f"Tipo: {chunk.get('type', 'N/A')}")
#             print(f"Texto: {_clean_chunk_text(chunk.get('markdown', 'N/A'))}")
#             print(f"Página: {chunk.get('grounding', 'N/A')['page']}")
#             print("---------------")
#         print("****************************************************")

In [None]:
from icecream import ic

# Build the merged list of groups. A group holding a single chunk (other than
# the very first group) takes the last chunk of the previously stored group
# and forms a new two-chunk group with it; every other group is copied as-is.
group_chunks_merged = []

for i, group_chunk in enumerate(group_chunks):
    is_lone_chunk = i > 0 and len(group_chunk) == 1

    # Plain copy: first group, multi-chunk group, or nothing left to borrow.
    if not is_lone_chunk or not group_chunks_merged[-1]:
        group_chunks_merged.append(list(group_chunk))
        continue

    # Detach the last chunk of the previously stored group ...
    *remaining_chunks, chunk_solution = group_chunks_merged[-1]
    group_chunks_merged[-1] = remaining_chunks

    # ... and pair it with the lone chunk of the current group.
    group_chunks_merged.append([chunk_solution, group_chunk[0]])
    print(f"✓ Grupo {i} fusionado con último elemento del grupo {i-1}")

# Drop groups that were emptied by the borrowing step above.
group_chunks_merged = [group for group in group_chunks_merged if group]


In [None]:
from icecream import ic
# Preview the first 30 merged groups: type, cleaned text and page of each chunk.
for i, group_chunk in enumerate(group_chunks_merged[0:30]):
    print("Group Number ", i)
    for chunk in group_chunk:
        # Missing grounding/page is reported as 'N/A' rather than raising
        # TypeError on 'N/A'['page'].
        grounding = chunk.get('grounding')
        page = grounding.get('page', 'N/A') if isinstance(grounding, dict) else 'N/A'
        print(f"Tipo: {chunk.get('type', 'N/A')}")
        print(f"Texto: {_clean_chunk_text(chunk.get('markdown', 'N/A'))}")
        print(f"Página: {page}")
        print("---------------")
    print("****************************************************")

In [None]:
# Number of chunk groups after merging (shown as the cell output).
len(group_chunks_merged)

## Parseando el Catálogo Scott con LLM

In [None]:
# Import your stamp models (assuming they're in a file called 'kg_pydantic.py')
# from stamp_models import ScottEntry, ScottNumber, Denomination, ColorDescription, Perforation, MonetaryValue, PrintingMethod, StampType, PaperType
from kg_pydantic import *
# Load environment variables 
from dotenv import load_dotenv
load_dotenv()

In [None]:
"""
Simplified Scott Catalog Parser with Direct Examples
This version uses a more straightforward approach with explicit examples
"""

import os
import json
import re
from typing import List, Dict, Any, Optional
from decimal import Decimal
from datetime import datetime

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.callbacks import get_openai_callback



class SimpleScottParser:
    """Few-shot LLM parser for Scott catalog text.

    Builds a LangChain pipeline (prompt -> ChatOpenAI -> JsonOutputParser) in
    which a system prompt and four worked input/output examples instruct the
    model to return JSON with the keys "stamps", "illustrations", "header"
    and "notes".
    """

    def __init__(self, openai_api_key: str, model_name: str = "gpt-4o-mini"):
        """Create the chat model, the JSON output parser and the parsing chain.

        Args:
            openai_api_key: API key passed to ChatOpenAI.
            model_name: Name of the OpenAI chat model to use.
        """
        self.llm = ChatOpenAI(
            temperature=0.0,
            model_name=model_name,
            openai_api_key=openai_api_key,
            max_tokens=4000
        )

        # Use JSON output parser instead of Pydantic
        self.output_parser = JsonOutputParser()

        # Create the chain
        self.chain = self._create_chain()

    def parse_chunk(self, text: str) -> Dict[str, Any]:
        """Send one chunk of catalog text through the chain.

        Token usage and cost reported by the OpenAI callback are printed.

        Args:
            text: Catalog text, forwarded to the chain unchanged.

        Returns:
            The value produced by the chain. When the call raises, a dict of
            the form {"stamps": [], "error": <message>} is returned instead.
        """
        try:
            with get_openai_callback() as cb:
                result = self.chain.invoke({"input": text})
                print("Prompt tokens:", cb.prompt_tokens)
                print("Completion tokens:", cb.completion_tokens)
                print("Total tokens:", cb.total_tokens)
                print("Costo (USD):", cb.total_cost)
                return result
        except Exception as e:
            print(f"Parsing error: {e}")
            return {"stamps": [], "error": str(e)}

    def _create_chain(self):
        """Create the parsing chain with explicit examples.

        Returns:
            The runnable `final_prompt | self.llm | self.output_parser`.
        """

        # Define examples with exact input/output
        examples = [
            {
                "input": """1889 Black Overprint
23 A8 1c rose     5.00 3.00
24 A9 5c brown    7.00 3.00
Vertical and inverted overprints are fakes.""",
                "output": json.dumps({
                    "stamps": [
                        {
                            "scott_number": "23",
                            "illustration": "A8",
                            "denomination": "1c",
                            "color": "rose",
                            "mint_value": 5.00,
                            "used_value": 3.00
                        },
                        {
                            "scott_number": "24",
                            "illustration": "A9",
                            "denomination": "5c",
                            "color": "brown",
                            "mint_value": 7.00,
                            "used_value": 3.00
                        }
                    ],
                    "header": "1889 Black Overprint",
                    "notes": ["Vertical and inverted overprints are fakes."]
                }, indent=2)
            },
            {
                "input": """1901, Jan.                                Perf. 12-15½
45 A30    1c green & blk                  3.25    .30
a.        Horiz. pair, imperf. btwn.      150.00
46 A31    2c ver & blk                    1.25    .30""",
                "output": json.dumps({
                    "stamps": [
                        {
                            "scott_number": "45",
                            "illustration": "A30",
                            "denomination": "1c",
                            "color": "green & blk",
                            "mint_value": 3.25,
                            "used_value": 0.30,
                            "perforation": "12-15½"
                        },
                        {
                            "scott_number": "45a",
                            "variety_of": "45",
                            "description": "Horiz. pair, imperf. btwn.",
                            "mint_value": 150.00
                        },
                        {
                            "scott_number": "46",
                            "illustration": "A31",
                            "denomination": "2c",
                            "color": "ver & blk",
                            "mint_value": 1.25,
                            "used_value": 0.30,
                            "perforation": "12-15½"
                        }
                    ],
                    "header": "1901, Jan.",
                    "perforation": "Perf. 12-15½"
                }, indent=2)
            },
            {
                "input": """<::A dark-colored postage stamp with a portrait of a man with a mustache in the center. The stamp has "CORREOS" and "COSTA RICA" written in a circular pattern around the portrait. The number "5" is visible in the top left and bottom right corners, and "CENTAVOS" is at the bottom. The stamp has perforated edges. : postage stamp::>
President Bernardo Soto Alfaro — A7""",
                "output": json.dumps({
                    "stamps": [],
                    "illustrations": [
                        {
                            "illustration_number": "A7",
                            "design_name": "President Bernardo Soto Alfaro",
                            "design_description": "A dark-colored postage stamp with a portrait of a man with a mustache in the center. The stamp has \"CORREOS\" and \"COSTA RICA\" written in a circular pattern around the portrait. The number \"5\" is visible in the top left and bottom right corners, and \"CENTAVOS\" is at the bottom. The stamp has perforated edges.",
                            "denomination": "5 CENTAVOS"
                        }
                    ]
                }, indent=2)
            },
            {
                "input": """<::A collection of six postage stamps, each featuring a portrait of President Soto Alfaro. The stamps are arranged in two columns. Top row: - Left stamp (A10): A brown stamp with a portrait of a man, labeled "COSTA RICA" at the top and "1 CENTAVO" at the bottom, with the number "1" in the upper corners. - Right stamp (A11): A greenish-blue stamp with a portrait of a man, labeled "COSTA RICA" at the top and "2 CENTAVOS" at the bottom, with the number "2" in the upper corners. Middle row: - Left stamp (A12): A reddish-orange stamp with a portrait of a man, labeled "COSTA RICA" at the top and "5 CENTAVOS" at the bottom, with the number "5" in the upper corners. - Right stamp (A13): A reddish-orange stamp with a portrait of a man, labeled "COSTA RICA" at the top and "10 CENTAVOS" at the bottom, with the number "10" in the upper corners. Bottom row: - Left stamp (A14): A green stamp with a portrait of a man, labeled "COSTA RICA" at the top and "20 CENTAVOS" at the bottom, with the number "20" in the upper corners. - Right stamp (A15): A reddish-orange stamp with a portrait of a man, labeled "COSTA RICA" at the top and "50 CENTAVOS" at the bottom, with the number "50" in the upper corners. : figure::>""",
                "output": json.dumps({
                    "stamps": [],
                    "illustrations": [
                        {
                            "illustration_number": "A10",
                            "design_description": "A brown stamp with a portrait of President Soto Alfaro",
                            "denomination": "1 CENTAVO",
                            "color": "brown"
                        },
                        {
                            "illustration_number": "A11",
                            "design_description": "A greenish-blue stamp with a portrait of President Soto Alfaro",
                            "denomination": "2 CENTAVOS",
                            "color": "greenish-blue"
                        },
                        {
                            "illustration_number": "A12",
                            "design_description": "A reddish-orange stamp with a portrait of President Soto Alfaro",
                            "denomination": "5 CENTAVOS",
                            "color": "reddish-orange"
                        },
                        {
                            "illustration_number": "A13",
                            "design_description": "A reddish-orange stamp with a portrait of President Soto Alfaro",
                            "denomination": "10 CENTAVOS",
                            "color": "reddish-orange"
                        },
                        {
                            "illustration_number": "A14",
                            "design_description": "A green stamp with a portrait of President Soto Alfaro",
                            "denomination": "20 CENTAVOS",
                            "color": "green"
                        },
                        {
                            "illustration_number": "A15",
                            "design_description": "A reddish-orange stamp with a portrait of President Soto Alfaro",
                            "denomination": "50 CENTAVOS",
                            "color": "reddish-orange"
                        }
                    ]
                }, indent=2)
            }
        ]

        # Create the few-shot prompt
        example_prompt = ChatPromptTemplate.from_messages([
            ("human", "{input}"),
            ("ai", "{output}")
        ])

        few_shot_prompt = FewShotChatMessagePromptTemplate(
            example_prompt=example_prompt,
            examples=examples,
        )

        # Final prompt
        final_prompt = ChatPromptTemplate.from_messages([
            ("system", """
    You are a Scott catalog parser. Extract stamp information and illustration descriptions from catalog text.

    CONTENT TYPES:
    1. STAMP ENTRIES: Lines starting with numbers (45, 46, 47) containing catalog data
    2. VARIETIES: Lines starting with letters (a., b., c.) that modify the stamp above
    3. ILLUSTRATIONS: Text within <::...::> describes stamp images, often followed by illustration numbers (A7, A10, etc.)

    RULES FOR STAMPS:
    - Main stamps start with numbers
    - Format: [number] [illustration] [denomination] [color] [mint_price] [used_price]
    - Multi-line entries continue from above
    - Prices: "3.25 .30" = mint $3.25, used $0.30
    - Dash (—) = no price

    RULES FOR ILLUSTRATIONS:
    - Text between <:: and ::> is a design description
    - Look for illustration numbers (A10, A11) within or after the description
    - Extract denomination and color when mentioned
    - May describe single or multiple stamps

    EXAMPLE COLOR ABBREVIATIONS:
    blk=black, grn=green, ver=vermillion, lil=lilac, ol=olive, bis=bistre, car=carmine, yel=yellow, brn=brown, dk=dark

    ALWAYS return valid JSON with the keys: "stamps", "illustrations", "header" and "notes" .
    Example of JSON for return:

    "stamps": [],        // If there are stamps if not empty
    "illustrations": [], // If there are illustrations if not empty
    "header": "1901, Jan.",  //Always if is possible 
    "notes": ["...."] //Always if there are notes if not empty


"""),
            few_shot_prompt,
            ("human", "{input}")
        ])

        return final_prompt | self.llm | self.output_parser

    @staticmethod
    def _dict_entries(value) -> List[Dict[str, Any]]:
        """Return the dict items of `value` when it is a list, otherwise []."""
        if not isinstance(value, list):
            return []
        return [entry for entry in value if isinstance(entry, dict)]

    def parse_and_display(self, text: str):
        """Parse `text` with parse_chunk and print a readable report.

        The report is tolerant of incomplete model output: entries lacking
        "scott_number" / "illustration_number" are printed with a placeholder,
        non-dict entries are skipped, and a "notes" value given as a single
        string is treated as one note.

        Args:
            text: Catalog text to parse.

        Returns:
            The value returned by parse_chunk, unmodified.
        """
        result = self.parse_chunk(text)
        print(result)

        print("\nPARSING RESULTS:")
        print("-" * 60)

        # The JSON parser returns whatever JSON value the model produced; the
        # report below only makes sense for an object.
        if not isinstance(result, dict):
            print(f"ERROR: expected a JSON object, got {type(result).__name__}")
            print("-" * 60)
            return result

        if "error" in result:
            print(f"ERROR: {result['error']}")
            return result

        # Display stamps
        stamps = self._dict_entries(result.get("stamps"))
        if stamps:
            print(f"Found {len(stamps)} stamps:\n")

            for stamp in stamps:
                number = stamp.get('scott_number', '?')
                if stamp.get("variety_of"):
                    print(f"  └─ #{number}: {stamp.get('description', 'Variety')}")
                    if stamp.get('mint_value'):
                        print(f"      Value: ${stamp['mint_value']}")
                else:
                    print(f"#{number} ({stamp.get('illustration', 'N/A')}): "
                         f"{stamp.get('denomination', '')} {stamp.get('color', '')}")
                    if stamp.get('mint_value'):
                        print(f"  Values: ${stamp['mint_value']} mint / "
                             f"${stamp.get('used_value', 'N/A')} used")

        # Display illustrations
        illustrations = self._dict_entries(result.get("illustrations"))
        if illustrations:
            print(f"\nFound {len(illustrations)} illustration descriptions:\n")

            for illus in illustrations:
                print(f"Illustration {illus.get('illustration_number', 'N/A')}:")
                if illus.get('design_name'):
                    print(f"  Name: {illus['design_name']}")
                if illus.get('denomination'):
                    print(f"  Denomination: {illus['denomination']}")
                if illus.get('color'):
                    print(f"  Color: {illus['color']}")
                desc = illus.get('design_description', '')
                if desc:
                    # Truncate long descriptions for display
                    if len(desc) > 100:
                        print(f"  Description: {desc[:100]}...")
                    else:
                        print(f"  Description: {desc}")

        # Display notes (a bare string would otherwise be printed one
        # character per bullet)
        notes = result.get("notes")
        if isinstance(notes, str):
            notes = [notes]
        if isinstance(notes, list) and notes:
            print("\nNOTES:")
            for note in notes:
                print(f"  • {note}")

        # Display header info
        if result.get("header"):
            print(f"\nHEADER: {result['header']}")
        if result.get("perforation"):
            print(f"PERFORATION: {result['perforation']}")

        print("-" * 60)
        return result


# Test function
def test_parser():
    """Run SimpleScottParser on a fixed sample chunk and report the outcome.

    The parsed result is printed via parse_and_display, written to
    scott_parse_results.json, summarised (total entries, main stamps,
    varieties) and returned.
    """

    # Sample catalog text used as parser input
    chunk_text = """1881-82
Red or Black Surcharge
7 A1(a) 1c on ½r ('82) 3.00 6.00
a. On No. 1a 15.00 -
8 A1(b) 1c on ½r ('82) 18.00 30.00
9 A1(c) 2c on ½r, #1a 3.00 2.75
a. On No. 1 8.00
12 A1(c) 5c on ½r 15.00
13 A1(d) 5c on ½r ('82) 35.00
14 A1(d) 10c on 2r (Bk)
('82) 72.50 -
15 A1(e) 20c on 4r ('82) 300.00 -

Overprints with different fonts and "OFICIAL" were never placed in use, and are said to have been surcharged to a dealer's order. The ½r surcharged "DOS CTS" is not a postage stamp. It probably is an essay.
Postally used examples of Nos. 7-15 are rare. Nos. 13-15 exist with a favor cancel having a hyphen between "San" and "Jose." Values same as unused. Fake cancellations exist.
Counterfeits exist of surcharges on Nos. 7-15.
---------------
 stamp of Gen. Prospero Fernández : figure

Gen. Prospero
Fernández - A6

1883, Jan. 1

| | | | | |
|---|---|---|---|---|
| 16 | A6 | 1c green | 3.00 | 1.50 |
| 17 | A6 | 2c carmine | 3.25 | 1.50 |
| 18 | A6 | 5c blue violet | 32.50 | 2.00 |
| 19 | A6 | 10c orange | 150.00 | 12.00 |
| 20 | A6 | 40c blue | 3.00 | 3.00 |
| Nos. 16-20 (5) | | | 191.75 | 20.00 |

Unused examples of 40c usually lack gum.
For overprints see Nos. O1-O20, O24,
Guanacaste 1-38, 44.
    
"""

    # Build the parser; the API key is read from the environment
    api_key = os.getenv("OPENAI_API_KEY", "your-api-key")
    scott_parser = SimpleScottParser(openai_api_key=api_key, model_name="gpt-4o-mini")

    # Parse the sample and print the formatted report
    parsed = scott_parser.parse_and_display(chunk_text)

    # Persist the raw parsed result
    with open("scott_parse_results.json", "w") as out_file:
        json.dump(parsed, out_file, indent=2)

    print("\nResults saved to scott_parse_results.json")

    # Summary: entries flagged with "variety_of" count as varieties,
    # everything else as main stamps
    entries = parsed.get("stamps", [])
    variety_count = sum(1 for entry in entries if entry.get("variety_of"))
    main_count = len(entries) - variety_count

    print("\nSUMMARY:")
    print(f"  Total entries: {len(entries)}")
    print(f"  Main stamps: {main_count}")
    print(f"  Varieties: {variety_count}")

    return parsed


# if __name__ == "__main__":
#     # Run the test
#     test_parser()

In [None]:
# Total number of merged chunk groups available for LLM parsing (cell output).
len(group_chunks_merged)

In [None]:
# Build the LLM input for a single merged group (index 592, picked by hand).
group_chunk = group_chunks_merged[592]

# _clean_chunk_text strips each chunk, so the chunks must be joined with an
# explicit separator; otherwise the last line of one chunk is glued to the
# first line of the next.
group_text = "\n\n".join(
    _clean_chunk_text(chunk.get('markdown', 'N/A')) for chunk in group_chunk
)
print(group_text)


In [None]:
# Parse the single group prepared above, save the result and print a summary.
parser = SimpleScottParser(
    openai_api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
    model_name="gpt-4o-mini"
)

# Parse and display
result = parser.parse_and_display(group_text)

# Save results; the path is kept in one place so the confirmation message
# always names the file that was actually written.
single_group_output_path = "scott_parse_results_1-17.json"
with open(single_group_output_path, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2)

print(f"\nResults saved to {single_group_output_path}")

# Summary: entries carrying "variety_of" are varieties, the rest main stamps
stamps = result.get("stamps", [])
main_stamps = [s for s in stamps if not s.get("variety_of")]
varieties = [s for s in stamps if s.get("variety_of")]
illustrations = result.get("illustrations", [])

print(f"\nSUMMARY:")
print(f"  Total entries: {len(stamps)}")
print(f"  Main stamps: {len(main_stamps)}")
print(f"  Varieties: {len(varieties)}")
print(f"  Illustrations: {len(illustrations)}")

In [None]:
# Accumulators filled by the sequential parsing loop: parsed outputs and
# failure records. Re-run this cell before re-running the loop to start clean.
results, error_groups = [], []

In [None]:
import os, json, time, datetime, traceback
from tqdm import tqdm


# Sequential parsing of the merged groups, one LLM call per group, with a
# progress bar showing per-iteration time, running average and ETA.
parser = SimpleScottParser(
    openai_api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
    model_name="gpt-4o-mini"
)

total = len(group_chunks_merged)
start = time.perf_counter()

start_num = 1                # 1-based number of the first group to process
start_idx = start_num - 1    # corresponding 0-based index

remaining = len(group_chunks_merged[start_idx:])


# The bar counts only the groups actually processed in this run.
with tqdm(total=remaining, desc="Parseando grupos", unit="grp") as pbar:
    for i, group_chunk in enumerate(group_chunks_merged[start_idx:], start_num):
        t0 = time.perf_counter()
        try:
            # Chunks are stripped by _clean_chunk_text, so join them with a
            # separator to keep their boundary lines apart.
            group_text = "\n\n".join(
                _clean_chunk_text(ch.get('markdown', 'N/A')) for ch in group_chunk
            )
            result = parser.parse_and_display(group_text)

            # parse_chunk swallows LLM/parsing exceptions and reports them as
            # a dict with an "error" key; record those as failures too.
            if isinstance(result, dict) and "error" in result:
                error_groups.append({
                    "group_number": i,
                    "error": str(result["error"]),
                    "traceback": ""
                })
            else:
                results.append(result)

        except Exception as e:
            # Keep the group number, the error and the traceback for debugging
            error_groups.append({
                "group_number": i,
                "error": str(e),
                "traceback": traceback.format_exc()
            })
        finally:
            # Update metrics/ETA and the bar even when the iteration failed
            iter_sec = time.perf_counter() - t0
            elapsed = time.perf_counter() - start
            done = (i - start_num + 1)  # iterations so far (success + failure)
            avg = elapsed / done
            remaining_sec = avg * (remaining - done)
            eta = datetime.timedelta(seconds=max(0, int(remaining_sec)))

            pbar.set_postfix(iter_s=f"{iter_sec:.2f}", avg_s=f"{avg:.2f}", eta=str(eta))
            pbar.update(1)




In [None]:
# Save the accumulated results and report the total elapsed time
results_path = "results/parsed_catalogues/scott_parse_results_1-17.json"
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("\nResults saved to scott_parse_results_1-17.json")
elapsed_seconds = int(time.perf_counter() - start)
print(f"Tiempo total: {datetime.timedelta(seconds=elapsed_seconds)}")

### Código en Batches

In [None]:
import os, json, time, datetime
from time import sleep
from tqdm import tqdm
from itertools import islice
from langchain_community.callbacks.manager import get_openai_callback

def chunked(it, size):
    """Yield consecutive lists of up to `size` items taken from `it`.

    The last list may be shorter; iteration ends at the first empty batch.
    """
    source = iter(it)
    # iter(callable, sentinel) keeps calling until an empty list comes back.
    yield from iter(lambda: list(islice(source, size)), [])

# --- Input preparation ---
# Failures are collected from the very first step, so a group whose text
# cannot be prepared is reported instead of silently disappearing.
error_groups = []
inputs = []
start_num = 1
start_idx = start_num - 1
for i, group_chunk in enumerate(group_chunks_merged[start_idx:], start_num):
    try:
        # Chunks are stripped by _clean_chunk_text, so join them with a
        # separator to keep their boundary lines apart.
        group_text = "\n\n".join(
            _clean_chunk_text(ch.get('markdown', 'N/A')) for ch in group_chunk
        )
    except Exception as e:
        # Preparing the text failed: record it and do NOT send it to the LLM
        error_groups.append({
            "group_number": i,
            "error": f"PREP_FAILURE: {type(e).__name__}: {str(e)}"
        })
        continue
    inputs.append({"i": i, "input": group_text})

max_concurrency = 3      # adjust to the API rate limits
subbatch_size   = 5      # number of inputs sent per wave
max_retries     = 2      # attempts per wave

# results[k] holds the output for inputs[k]; None marks a failed item
results = [None] * len(inputs)

t0 = time.perf_counter()
with tqdm(total=len(inputs), desc="Parseando (batch)", unit="grp") as pbar:
    with get_openai_callback() as cb:
        base = 0  # index in `inputs` of the first item of the current wave
        for sub in chunked(inputs, subbatch_size):
            sub_payload = [{"input": s["input"]} for s in sub]

            # --- Batch call with retries ---
            outs = None
            last_err = None
            for attempt in range(1, max_retries + 1):
                try:
                    outs = parser.chain.batch(
                        sub_payload,
                        config={"max_concurrency": max_concurrency},
                        return_exceptions=True  # per-item errors come back as Exception objects
                    )
                    break  # success: leave the retry loop
                except Exception as e:
                    last_err = e
                    # Simple exponential backoff before the next attempt
                    if attempt < max_retries:
                        sleep(2 ** (attempt - 1))

            if outs is None:
                # The whole wave failed after all attempts: mark every item
                # of the wave as an error and advance the bar accordingly.
                for item in sub:
                    error_groups.append({
                        "group_number": item["i"],
                        "error": f"BATCH_FAILURE: {type(last_err).__name__}: {str(last_err)}"
                    })
                pbar.update(len(sub))
                base += len(sub)
                continue

            # --- Process per-item outputs ---
            for j, (item, out) in enumerate(zip(sub, outs)):
                if isinstance(out, Exception):
                    error_groups.append({
                        "group_number": item["i"],
                        "error": f"ITEM_FAILURE: {type(out).__name__}: {str(out)}"
                    })
                else:
                    results[base + j] = out
                pbar.update(1)

            base += len(sub)

        print(f"\nTokens prompt: {cb.prompt_tokens} | completion: {cb.completion_tokens} | total: {cb.total_tokens}")
        print(f"Costo total (USD): {cb.total_cost:.6f}")

elapsed = time.perf_counter() - t0
print(f"Tiempo total: {datetime.timedelta(seconds=int(elapsed))}")

# --- Save ---
os.makedirs("results/parsed_catalogues", exist_ok=True)
ok = [r for r in results if r is not None]
with open("results/parsed_catalogues/scott_parse_results_18-34.json", "w", encoding="utf-8") as f:
    json.dump(ok, f, indent=2, ensure_ascii=False)

with open("results/parsed_catalogues/scott_parse_errors_18-34.json", "w", encoding="utf-8") as f:
    json.dump(error_groups, f, indent=2, ensure_ascii=False)

print(f"OK: {len(ok)} | Errores: {len(error_groups)}")


In [None]:
# """
# Scott Catalog Parser using LangChain + OpenAI + Pydantic
# Enhanced version with expert philatelic knowledge for Costa Rica stamps
# """

# import os
# import json
# import re
# from typing import List, Optional, Dict, Any, Union
# from decimal import Decimal
# from datetime import datetime
# from enum import Enum

# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.output_parsers import PydanticOutputParser
# from langchain.output_parsers import OutputFixingParser
# from pydantic import BaseModel, Field, field_validator, model_validator

# # Import your stamp models (assuming they're in a file called 'stamp_models.py')
# # from stamp_models import ScottEntry, ScottNumber, Denomination, ColorDescription, Perforation, MonetaryValue, PrintingMethod, StampType, PaperType

# # ============================================================================
# # ENHANCED PARSING MODELS
# # ============================================================================

# class ChunkType(str, Enum):
#     """Types of content chunks from ADE Parse"""
#     TEXT = "text"           # Regular text content
#     TABLE = "table"         # Tabular data
#     MARGINALIA = "marginalia"  # Side notes
#     FIGURE = "figure"       # Image descriptions
#     LOGO = "logo"          # Logo images
#     CARD = "card"          # Card elements (DPT-2)
#     ATTESTATION = "attestation"  # Attestations (DPT-2)
#     SCAN_CODE = "scan_code"  # QR/barcodes (DPT-2)


# class StampEntryRaw(BaseModel):
#     """Raw stamp entry as parsed from catalog"""
#     scott_number: str = Field(..., description="Full Scott number like '23', 'C146a', '45'")
#     illustration_number: Optional[str] = Field(None, description="Illustration like 'A8', 'A30'")
#     denomination: str = Field(..., description="Raw denomination like '1c', '5c', '1col'")
#     color: str = Field(..., description="Color description like 'green & blk'")
#     mint_value: Optional[float] = Field(None, description="Mint condition value in USD")
#     used_value: Optional[float] = Field(None, description="Used condition value in USD")
#     perforation: Optional[str] = Field(None, description="Perforation measurement")
#     year: Optional[int] = Field(None, description="Year of issue")
#     notes: Optional[str] = Field(None, description="Special notes about this stamp")
#     is_variety: bool = Field(default=False, description="Is this a variety (a, b, c suffix)")
#     variety_description: Optional[str] = Field(None, description="Description of the variety")
#     parent_scott_number: Optional[str] = Field(None, description="Main stamp number if this is a variety")


# class ScottEntryResponse(BaseModel):
#     """Response model for parsing Scott catalog entries"""
#     entries: List[StampEntryRaw] = Field(
#         default_factory=list,
#         description="List of Scott stamp entries found in the chunk"
#     )
#     has_stamp_info: bool = Field(
#         ...,
#         description="Whether this chunk contains actual stamp catalog entries"
#     )
#     chunk_type: ChunkType = Field(
#         ...,
#         description="Original chunk type from ADE Parse"
#     )
#     section_header: Optional[str] = Field(
#         None,
#         description="Section header if present (e.g., '1901, Jan.', 'Perf. 12-15½')"
#     )
#     perforation_info: Optional[str] = Field(
#         None,
#         description="Perforation specification for the section"
#     )
#     cross_references: List[str] = Field(
#         default_factory=list,
#         description="Cross-references to other catalog sections"
#     )
#     warnings: List[str] = Field(
#         default_factory=list,
#         description="Warnings or notes about forgeries, reprints, etc."
#     )
#     summary_line: Optional[str] = Field(
#         None,
#         description="Summary line like 'Nos. 45-54 (10)'"
#     )
#     parsing_notes: Optional[str] = Field(
#         None,
#         description="Notes about the parsing process"
#     )


# # ============================================================================
# # ENHANCED SCOTT CATALOG PARSER
# # ============================================================================

# class ScottCatalogExpertParser:
#     """Expert parser for Scott catalog with deep philatelic knowledge"""
    
#     def __init__(self, openai_api_key: str, model_name: str = "gpt-4o-mini"):
#         """
#         Initialize the expert parser
        
#         Args:
#             openai_api_key: OpenAI API key
#             model_name: Model to use (gpt-4o-mini, gpt-4o, etc.)
#         """
#         self.llm = ChatOpenAI(
#             temperature=0.0,  # Zero temperature for maximum consistency
#             model_name=model_name,
#             openai_api_key=openai_api_key,
#             max_tokens=2000
#         )
        
#         # Setup output parser with Pydantic
#         self.pydantic_parser = PydanticOutputParser(pydantic_object=ScottEntryResponse)
        
#         # Add fixing parser for robustness
#         self.output_parser = OutputFixingParser.from_llm(
#             parser=self.pydantic_parser,
#             llm=self.llm,
#             max_retries=2
#         )
        
#         # Create the expert prompt template
#         self.prompt = self._create_expert_prompt_template()
        
#         # Create the chain
#         self.chain = self.prompt | self.llm | self.output_parser
    
#     def _create_expert_prompt_template(self) -> ChatPromptTemplate:
#         """Create the expert philatelic prompt template"""
        
#         system_message = """You are a world-renowned philatelic expert specializing in Costa Rica stamps and the Scott catalog system. You have decades of experience parsing Scott catalog entries with perfect accuracy.

# SCOTT CATALOG ENTRY STRUCTURE:
# The standard format is: [Scott#] [Illustration#] [Denomination] [Color] [Mint$] [Used$]
# Example: "23 A8 1c rose 5.00 3.00"

# This means:
# - Scott #23
# - Uses illustration A8 (design type)
# - 1 centavo denomination
# - Rose color
# - $5.00 mint value
# - $3.00 used value

# CRITICAL PARSING RULES:

# 1. SCOTT NUMBER FORMATS:
#    - Regular: "1", "23", "146" (just numbers)
#    - Airmail: "C1", "C146" (C prefix)
#    - Semi-postal: "B1" (B prefix)
#    - Official: "O1" (O prefix)
#    - Postage Due: "J1" (J prefix)
#    - Major varieties: "16A", "23B" (capital letter suffix)
#    - Minor varieties: "146a", "23c" (lowercase letter suffix)

# 2. ILLUSTRATION NUMBERS:
#    - Format: Letter(s) + Number like "A1", "A8", "AP12", "D3"
#    - These are DESIGN TYPES, not individual stamps
#    - Multiple stamps can share the same illustration number
#    - Critical for identifying reprints and varieties

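# 3. DENOMINATION PARSING:
#    - "1c" = 1 centavo on peso-era issues, or 1 céntimo on colón-era issues (the currency changed to 100 céntimos = 1 colón in 1900)
#    - "5c" = 5 centavos (peso era) or 5 céntimos (colón era)
#    - "10c" = 10 centavos (peso era) or 10 céntimos (colón era); decide from the issue year or from "col" values in the same set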
#    - "1r" = 1 real
#    - "1/2r" = medio real (0.5 real)
#    - "1p" = 1 peso
#    - "₡1" or "1col" = 1 colón

# 4. COLOR DESCRIPTIONS:
#    Standard Scott colors include:
#    - Basic: black, blue, brown, green, red, violet, yellow, orange
#    - Shades: deep, dark, light, pale, bright, dull
#    - Specific: carmine, vermillion, ultramarine, emerald, scarlet
#    - Compounds: "blue & red", "green and black"
#    - Complex: "carmine rose", "yellow brown", "blue green"

# 5. VALUE PARSING:
#    - Format: [mint] [used] like "5.00 3.00"
#    - Dash means no established value: "5.00 -"
#    - Values are in USD without $ symbol
#    - Handle ranges by taking first value

# 6. PERFORATION FORMATS:
#    - Simple: "Perf. 12", "Perf. 14"
#    - Compound: "Perf. 12x11½", "Perf. 14x13"
#    - Range: "Perf. 14-16"
#    - Complex: "Perf. 14-16 & Compound"
#    - Imperforate: "Imperf" or "Imperforate"

# 7. OVERPRINT SECTIONS:
#    - Headers like "1889 Black Overprint" indicate overprinted stamps
#    - These are regular stamps with additional printing
#    - Note overprint color and year
#    - Watch for warnings about forgeries

# 8. FIGURE DESCRIPTIONS:
#    - Chunks with <::...::> contain stamp image descriptions
#    - Extract illustration numbers (A10, A11, etc.)
#    - Match denominations and colors from descriptions
#    - These provide visual confirmation but aren't catalog entries themselves

# 9. TABLE FORMAT:
#    - Tables may have rowspan/colspan
#    - Item column often has Scott# and Illustration#
#    - Parse each row as separate entry

# 10. WARNINGS AND NOTES:
#    - "Vertical and inverted overprints are fakes" = forgery warning
#    - "For overprints see..." = cross-reference
#    - "Nos. 1-4 exist imperforate..." = variety note

# 11. SECTION HEADERS:
#    - Year headers: "1889", "1901"  
#    - Type headers: "Black Overprint", "Postage Due", "Air Post"
#    - Issue names: "First Issue", "Coat of Arms"

# 12. CHUNK CLASSIFICATION:
#    - stamp_listings: Contains actual catalog entries with Scott numbers
#    - overprint_section: Overprinted stamp listings
#    - figure_description: Visual descriptions of stamps
#    - table_data: Tabular stamp data
#    - notes: Explanatory text without listings
#    - cross_reference: References to other sections
#    - header: Section/year headers only
#    - logo: Publisher logos (ignore)
#    - other: Unrelated content

# {format_instructions}

# EXPERT GUIDELINES:
# - Parse EXACTLY what's in the text - no assumptions
# - Preserve all technical details precisely
# - Flag any suspicious or ambiguous entries
# - Note all warnings about forgeries or reprints
# - Extract ALL cross-references for context
# - If a chunk has a section header, always include it
# - For figure descriptions, extract illustration mappings even if no catalog entries
# - Convert fractional denominations: 1/2 = 0.5, 1/3 = 0.333, 1/4 = 0.25
# - Maintain the distinction between centavos (subunit of the peso, earlier issues) and céntimos (subunit of the colón, later issues); both are abbreviated "c" in listings"""
        
#         user_message = """Parse this Scott catalog chunk with expert precision:

# CHUNK CONTENT:
# {chunk_text}

# Instructions:
# 1. Identify the chunk type first
# 2. Extract all stamp entries if present
# 3. Note section headers, warnings, and cross-references
# 4. For figures, map illustration numbers to descriptions
# 5. Flag any parsing ambiguities in parsing_notes"""
        
#         return ChatPromptTemplate.from_messages([
#             ("system", system_message),
#             ("user", user_message)
#         ]).partial(format_instructions=self.pydantic_parser.get_format_instructions())
    
#     def parse_chunk(self, chunk_data: Union[str, Dict[str, Any]], 
#                    chunk_id: Optional[str] = None) -> ScottEntryResponse:
#         """
#         Parse a single chunk of catalog text
        
#         Args:
#             chunk_data: Either raw text or chunk dict from ADE Parse
#             chunk_id: Optional identifier for the chunk
            
#         Returns:
#             ScottEntryResponse with extracted stamps or metadata
#         """
#         # Extract text and metadata based on input type
#         if isinstance(chunk_data, dict):
#             chunk_text = chunk_data.get("markdown", chunk_data.get("text", ""))
#             chunk_id = chunk_id or chunk_data.get("id")
#             # Get the actual ADE chunk type
#             ade_chunk_type = chunk_data.get("type", "text")
#             try:
#                 chunk_type = ChunkType(ade_chunk_type)
#             except ValueError:
#                 chunk_type = ChunkType.TEXT
#         else:
#             chunk_text = chunk_data
#             chunk_type = ChunkType.TEXT
        
#         # Clean up markdown artifacts
#         chunk_text = self._clean_chunk_text(chunk_text)
        
#         try:
#             # Invoke the chain with the chunk text
#             result = self.chain.invoke({"chunk_text": chunk_text})
            
#             # Override with actual chunk type from ADE
#             result.chunk_type = chunk_type
            
#             # Post-process results
#             result = self._post_process_results(result, chunk_id)
            
#             return result
            
#         except Exception as e:
#             # Return error response
#             return ScottEntryResponse(
#                 entries=[],
#                 has_stamp_info=False,
#                 chunk_type=chunk_type,
#                 parsing_notes=f"Parsing error: {str(e)}"
#             )
    
#     def _clean_chunk_text(self, text: str) -> str:
#         """Clean markdown and formatting artifacts"""
#         # Remove anchor tags
#         text = re.sub(r'<a id=[\'"][^\'"]+[\'"]></a>\n*', '', text)
#         # Remove figure markup but keep content
#         text = re.sub(r'<::(.*?)::>', r'\1', text, flags=re.DOTALL)
#         # Clean excessive whitespace
#         text = re.sub(r'\n{3,}', '\n\n', text)
#         return text.strip()
    
#     def _post_process_results(self, result: ScottEntryResponse, 
#                              chunk_id: Optional[str]) -> ScottEntryResponse:
#         """Post-process and validate parsing results"""
        
#         # Add chunk_id to metadata
#         if chunk_id:
#             for entry in result.entries:
#                 if not hasattr(entry, 'metadata'):
#                     entry.metadata = {}
#                 entry.metadata["chunk_id"] = chunk_id
        
#         # Validate Scott numbers
#         for entry in result.entries:
#             if not self._validate_scott_number(entry.scott_number):
#                 if result.parsing_notes:
#                     result.parsing_notes += f"\nWarning: Suspicious Scott number: {entry.scott_number}"
#                 else:
#                     result.parsing_notes = f"Warning: Suspicious Scott number: {entry.scott_number}"
        
#         return result
    
#     def _validate_scott_number(self, scott_number: str) -> bool:
#         """Validate Scott number format"""
#         pattern = r'^([A-Z]{0,4})?(\d+)([A-Z])?([a-z]+)?$'
#         return bool(re.match(pattern, scott_number))
    
#     def parse_chunks_batch(self, chunks: List[Union[str, Dict]], 
#                           show_progress: bool = True) -> Dict[str, Any]:
#         """
#         Parse multiple chunks in batch with progress tracking
        
#         Args:
#             chunks: List of chunks (text or dicts)
#             show_progress: Show progress messages
            
#         Returns:
#             Comprehensive results dictionary
#         """
#         results = []
#         errors = []
#         statistics = {
#             "total_chunks": len(chunks),
#             "chunks_with_stamps": 0,
#             "total_stamps": 0,
#             "total_varieties": 0,
#             "text_chunks": 0,
#             "table_chunks": 0,
#             "figure_chunks": 0,
#             "other_chunks": 0
#         }
        
#         for i, chunk in enumerate(chunks):
#             if show_progress:
#                 print(f"Processing chunk {i+1}/{len(chunks)}...")
            
#             try:
#                 response = self.parse_chunk(chunk)
                
#                 # Update chunk type statistics
#                 if response.chunk_type == ChunkType.TEXT:
#                     statistics["text_chunks"] += 1
#                 elif response.chunk_type == ChunkType.TABLE:
#                     statistics["table_chunks"] += 1
#                 elif response.chunk_type == ChunkType.FIGURE:
#                     statistics["figure_chunks"] += 1
#                 else:
#                     statistics["other_chunks"] += 1
                
#                 # Count stamps and varieties
#                 if response.entries:
#                     statistics["chunks_with_stamps"] += 1
#                     for entry in response.entries:
#                         statistics["total_stamps"] += 1
#                         if entry.is_variety:
#                             statistics["total_varieties"] += 1
                
#                 # Store results if there's useful information
#                 if response.entries or response.warnings or response.cross_references:
#                     result_data = {
#                         "chunk_index": i,
#                         "chunk_type": response.chunk_type.value,
#                         "section_header": response.section_header,
#                         "perforation": response.perforation_info,
#                         "entries": [entry.model_dump() for entry in response.entries],
#                         "cross_references": response.cross_references,
#                         "warnings": response.warnings,
#                         "summary": response.summary_line,
#                         "notes": response.parsing_notes
#                     }
                    
#                     # Add chunk_id if available
#                     if isinstance(chunk, dict) and "id" in chunk:
#                         result_data["chunk_id"] = chunk["id"]
                    
#                     results.append(result_data)
                    
#                     if show_progress:
#                         if response.entries:
#                             print(f"  ✓ Found {len(response.entries)} stamps")
#                         else:
#                             print(f"  ✓ Found metadata/references")
#                 else:
#                     if show_progress:
#                         print(f"  - No stamp data (type: {response.chunk_type.value})")
                
#             except Exception as e:
#                 error_data = {
#                     "chunk_index": i,
#                     "error": str(e),
#                     "chunk_preview": str(chunk)[:200] if isinstance(chunk, str) 
#                                     else chunk.get("markdown", "")[:200]
#                 }
#                 errors.append(error_data)
#                 if show_progress:
#                     print(f"  ✗ Error: {str(e)[:50]}...")
        
#         return {
#             "results": results,
#             "errors": errors,
#             "statistics": statistics,
#             "summary": self._generate_summary(statistics)
#         }
    
#     def _generate_summary(self, stats: Dict[str, Any]) -> str:
#         """Generate a human-readable summary"""
#         return (
#             f"Processed {stats['total_chunks']} chunks:\n"
#             f"- Found {stats['total_stamps']} stamp entries total\n"
#             f"  - Main stamps: {stats['total_stamps'] - stats['total_varieties']}\n"
#             f"  - Varieties: {stats['total_varieties']}\n"
#             f"- Chunks with stamps: {stats['chunks_with_stamps']}\n"
#             f"- Chunk types: {stats['text_chunks']} text, {stats['table_chunks']} table, "
#             f"{stats['figure_chunks']} figure, {stats['other_chunks']} other"
#         )
    
#     def save_results(self, results: Dict[str, Any], output_file: str):
#         """Save results to JSON with proper formatting"""
        
#         with open(output_file, 'w', encoding='utf-8') as f:
#             json.dump(results, f, indent=2, ensure_ascii=False, default=str)
        
#         print(f"\n✓ Results saved to {output_file}")
#         print("\nSummary:")
#         print(results.get("summary", "No summary available"))


# # ============================================================================
# # USAGE EXAMPLES
# # ============================================================================

# def test_with_real_chunks():
#     """Test with the real chunk examples provided"""
#     OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
#     # Initialize parser
#     parser = ScottCatalogExpertParser(
#         openai_api_key=OPENAI_API_KEY,
#         model_name="gpt-4o-mini"
#     )
    
#     # Real chunks from ADE Parse
#     test_chunks = [
                
#         {
#             "id": "ca72a5ed-984e-4264-9d63-f1ebf2fdc47e",
#             "grounding": {
#             "box": {
#                 "bottom": 0.8415362238883972,
#                 "left": 0.05093258619308472,
#                 "right": 0.268806517124176,
#                 "top": 0.6473554968833923
#             },
#             "page": 1
#             },
#             "markdown": "<a id='ca72a5ed-984e-4264-9d63-f1ebf2fdc47e'></a>\n\n1901, Jan.                                Perf. 12-15½\n45 A30    1c green & blk                  3.25    .30\na.        Horiz. pair, imperf. btwn.      150.00\n46 A31    2c ver & blk                    1.25    .30\n47 A32    5c gray blue &\n          blk                             3.25    .30\na.        Vert. pair, imperf. btwn.       —       300.00\n48 A33    10c ocher & blk                 3.25    .35\n49 A34    20c lake & blk                  22.50   .25\na.        Vert. pair, imperf. btwn.       1,000.\n50 A35    50c dull lil & dk bl            5.50    1.00\n51 A36    1col ol bis & blk               110.00  3.50\n52 A37    2col car rose & dk\n          grn                             16.00   3.00\n53 A38    5col brown & blk                75.00   3.50\n54 A39    10col yel grn & brn\n          red                             29.00   3.00\nNos. 45-54 (10)                           269.00  15.50\n\nThe 2c exists with center inverted. Value\n$77,500.\nNos. 45-57 in other colors are private\nreprints made in 1948. They have little value.\nFor surcharge and overprints see Nos. 58,\n78, O37-O44.",
#             "type": "text"
#         }
#     ]
    
#     # Process chunks
#     results = parser.parse_chunks_batch(test_chunks)
    
#     # Save results
#     parser.save_results(results, "costa_rica_stamps_test.json")
    
#     # Print detailed results
#     for result in results["results"]:
#         print(f"\n{'='*50}")
#         print(f"Chunk Type: {result['chunk_type']}")
#         if result['section_header']:
#             print(f"Section: {result['section_header']}")
#         print(f"Stamps found: {len(result['entries'])}")
        
#         for entry in result['entries']:
#             print(f"  - Scott #{entry['scott_number']}: {entry['denomination']} {entry['color']}")
#             if entry.get('mint_value'):
#                 print(f"    Values: ${entry['mint_value']} mint, ${entry.get('used_value', '-')} used")


# if __name__ == "__main__":
#     # Test with real chunks
#     test_with_real_chunks()