In [None]:
"""
Costa Rica Philatelic Knowledge Graph Extractor - EXPERT VERSION
Handles all Costa Rican denomination systems and catalog syntax
By: Philately & Regex Expert
"""

import json
import re
from typing import Dict, List, Tuple, Optional, Set
from dataclasses import dataclass, asdict, field
from bs4 import BeautifulSoup
import os

import os
import json
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from matplotlib.patches import FancyBboxPatch
from collections import defaultdict
import pandas as pd

In [None]:
#!pip install networkx matplotlib pandas numpy

In [None]:

@dataclass
class PhilatelicNode:
    """One catalog entity (stamp, proof, specimen, essay, variety or error).

    Only ``catalog_number`` and ``node_type`` are required; every other field
    has a default so that partially described entries can still be recorded.
    """
    # Identification
    catalog_number: str
    node_type: str  # one of: stamp, proof, specimen, essay, variety, error
    sub_type: Optional[str] = None  # finer kind, e.g. die_proof, plate_proof, color_variety
    # Descriptive data read from the catalog line
    denomination: Optional[str] = None
    color: Optional[str] = None
    year: Optional[str] = None
    issue_name: Optional[str] = None  # heading text of the most recent section element
    quantity: Optional[str] = None
    # Position inside the source document
    page_number: int = 0
    reading_order: int = 0
    raw_text: str = ""
    # Neighbouring text; each instance gets its own fresh list
    context_before: List[str] = field(default_factory=list)
    context_after: List[str] = field(default_factory=list)
    attributes: List[str] = field(default_factory=list)
    reference_numbers: List[str] = field(default_factory=list)
    base_stamp: Optional[str] = None  # for proofs/varieties: the stamp number they derive from

    def to_dict(self):
        """Return the fields as a dict, leaving out None, empty lists and empty strings."""
        blanks = (None, [], "")
        compact = {}
        for name, value in asdict(self).items():
            if value in blanks:
                continue
            compact[name] = value
        return compact

class CostaRicaCatalogParser:
    """Expert parser for Costa Rica philatelic catalog with full denomination support"""
    
    # COMPREHENSIVE DENOMINATION REGEX
    # Handles: centavos(c), Colones(C/col), pesos(p), reales(r), fractions(½, 1/2)
    # Full words: centimos, centavos, colones, pesos, reales
    # The pattern is case-insensitive, but the captured unit keeps its original
    # case, so callers can still tell a capital "C" from a lowercase "c".
    DENOMINATION = re.compile(
        r'\b('  # Capture group 1: the value
            r'\d+(?:[/.]?\d+)?'  # Plain numbers: 15, 1.5, 1/2
            r'|½|¼|¾'            # Or a standalone fraction symbol
        r')\s*'
        r'('  # Capture group 2: the unit
            r'c|C|col|colón|colones|p|peso|pesos|r|real|reales|'
            r'centimo|centimos|céntimo|céntimos|centavo|centavos|céntavo|céntavos'
        r')\b',
        re.IGNORECASE
    )
    
    # Surcharge pattern: "1c on ½ real", "5c on 2 reales", etc.
    # Groups: (new value in c) (old value) (real|reales).
    # NOTE(review): not referenced by the methods visible in this part of the file.
    SURCHARGE = re.compile(
        r'(\d+)\s*c\s+on\s+([½¼¾\d/]+)\s*(real|reales)',
        re.IGNORECASE
    )
    
    # More precise: only match denomination after catalog number, not standalone
    # Groups: (catalog number) (denomination value) (unit).
    # NOTE(review): not referenced by the methods visible in this part of the file.
    DENOMINATION_AFTER_CATALOG = re.compile(
        r'(?:DP|PP|S|E|I|OP|^|\s)(\d+[a-z]?)\s+'  # Catalog number
        r'(\d+(?:[/.]?\d+|½|¼)?)\s*'  # Denomination value
        r'(c|C|col|colón|colones|p|peso|pesos|r|real|reales|'
        r'centimo|centimos|céntimo|céntimos|centavo|centavos)\b',
        re.IGNORECASE
    )
    
    # Color patterns - comprehensive
    # COLORS has two capture groups (optional modifier, color name), so
    # COLORS.findall() returns (modifier, name) tuples with '' for a missing modifier.
    # NOTE(review): 'bright' is listed twice in COLOR_MODIFIERS; the repeat is redundant.
    COLOR_MODIFIERS = r'(deep|light|dark|bright|pale|dull|bright)'
    COLOR_NAMES = r'(black|violet|blue|red|green|orange|brown|yellow|gray|grey|' \
                  r'scarlet|ultramarine|carmine|olive|purple|pink|rose|magenta|' \
                  r'turquoise|indigo|vermillion|sepia|slate|cobalt|crimson)'
    COLORS = re.compile(rf'\b{COLOR_MODIFIERS}?\s*{COLOR_NAMES}\b', re.IGNORECASE)
    
    # Four-digit years from 1800 through 2099.
    YEAR = re.compile(r'\b(18\d{2}|19\d{2}|20\d{2})\b')
    
    # Quantity: comma-grouped numbers only (e.g. 10,000 or 1,250,000).
    # Group 1 is a bare comma-grouped number; group 2 is a comma-grouped number
    # that follows "printed", "issued" or "quantity". Exactly one group is set.
    # NOTE(review): plain numbers without a comma (e.g. "5000") never match,
    # even after a keyword.
    QUANTITY = re.compile(
        r'\b(\d{1,3}(?:,\d{3})+)\b|'  # Try comma pattern FIRST
        r'(?:printed|issued|quantity)[:\s]+(\d{1,3}(?:,\d{3})+)\b',  # Comma-grouped digits are accepted after the keyword as well
        re.IGNORECASE
    )
    
    # "#" followed by word characters or hyphens; the captured text excludes the "#".
    REFERENCE = re.compile(r'#([\w-]+)')
    
    # Lowercase phrases looked up as plain substrings of the lowercased line text.
    ATTRIBUTES = [
        'imperf', 'imperforate', 'inverted', 'double perf', 'triple perf',
        'tete beche', 'tete-beche', 'téte-béche', 'shifted', 'gutter pair', 
        'sunk on card', 'no numeral', 'omitted center', 'inverted center', 
        'double op', 'horizontal pair', 'vertical pair', 'right margin', 
        'left margin', 'lower margin', 'upper margin', 'inverted op',
        'misplaced', 'offset', 'printed on both sides', 'blind perf',
        'photographic proof'
    ]
    
    def __init__(self):
        self.current_issue = None  # Most recent <sec> element
        self.current_sec = None          
        self.current_sub_sec = None      
        self.current_sub_sub_sec = None  
        self.current_section = None  # Proof, Regular issue, etc.
        self.current_year = None
        self.stamps_seen = set()
    
    def clean_latex(self, text: str) -> str:
        """Strip LaTeX markup from OCR text and return the plain content.

        When the text contains an ``array`` environment, only the array body is
        returned: one cleaned row per line, with the cell separators replaced
        by spaces. The column specification that follows the opening of the
        environment (for example ``{ll}``) is skipped and never becomes part of
        the first row. Text without a usable array is cleaned in place and its
        whitespace collapsed to single spaces.

        Args:
            text: Raw text that may contain LaTeX commands.

        Returns:
            The cleaned text, stripped of leading and trailing whitespace.
        """
        if '\\begin{array}' in text:
            # The optional "{...}" right after the opening is the column
            # specification; it is matched outside the capture group so it
            # cannot leak into the first row.
            match = re.search(
                r'\\begin\{array\}\s*(?:\{[^{}]*\})?(.*?)\\end\{array\}',
                text,
                re.DOTALL,
            )
            if match:
                cleaned_rows = []
                # A double backslash separates the rows of an array.
                for row in match.group(1).split('\\\\'):
                    row = re.sub(r'\\text\s*\{\s*([^}]*)\s*\}', r'\1', row)
                    row = re.sub(r'\\mathrm\s*\{\s*([^}]*)\s*\}', r'\1', row)
                    row = re.sub(r'&', ' ', row)
                    row = re.sub(r'\\[a-zA-Z]+', '', row)
                    row = re.sub(r'[{}$\\]', '', row)
                    row = re.sub(r'\s+', ' ', row).strip()
                    if row:
                        cleaned_rows.append(row)
                return '\n'.join(cleaned_rows)

        # Inline math such as "$^{7}$" or "${a}$" keeps only its content.
        text = re.sub(r'\$\^?\{(\w+)\}\$', r'\1', text)
        text = re.sub(r'\\text\s*\{\s*([^}]*)\s*\}', r'\1', text)
        text = re.sub(r'\\mathrm\s*\{\s*([^}]*)\s*\}', r'\1', text)
        # Any other command with one argument keeps the argument ...
        text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)
        # ... and commands without an argument are dropped.
        text = re.sub(r'\\[a-zA-Z]+', '', text)
        text = re.sub(r'[{}$]', '', text)
        text = re.sub(r'\s+', ' ', text)

        return text.strip()
    
    def normalize_denomination(self, value: str, unit: str) -> str:
        """Combine a value and a unit into one normalized denomination string.

        The unit is reduced to a short code: ``c`` (centimos/centavos),
        ``col`` (colones), ``p`` (pesos) or ``r`` (reales). A unit written as a
        single capital ``C`` means colones, whereas a lowercase ``c`` means
        centimos/centavos, so that test runs before any lowercasing. Unknown
        units are lowercased and kept.

        The fraction symbols ½, ¼ and ¾ become decimals, and ``a/b`` is
        divided out. A fraction that cannot be evaluated (non-numeric part or
        zero denominator) is kept exactly as written.

        Args:
            value: Numeric part as written, e.g. ``"5"``, ``"1.5"``, ``"1/2"``, ``"½"``.
            unit: Unit as written, e.g. ``"c"``, ``"C"``, ``"reales"``.

        Returns:
            Normalized value immediately followed by the unit code, e.g. ``"0.5r"``.
        """
        unit_groups = (
            ('c', ('c', 'centimo', 'centimos', 'céntimo', 'céntimos',
                   'centavo', 'centavos', 'céntavo', 'céntavos')),
            ('col', ('col', 'colón', 'colones')),
            ('p', ('p', 'peso', 'pesos')),
            ('r', ('r', 'real', 'reales')),
        )
        unit_key = unit.lower()

        # CRITICAL: the capital-C check must come before the lowercase lookup.
        if unit == 'C':
            unit_normalized = 'col'
        else:
            unit_normalized = unit_key
            for code, spellings in unit_groups:
                if unit_key in spellings:
                    unit_normalized = code
                    break

        fraction_symbols = {'½': '0.5', '¼': '0.25', '¾': '0.75'}
        if value in fraction_symbols:
            value_normalized = fraction_symbols[value]
        elif '/' in value:
            parts = value.split('/')
            try:
                value_normalized = str(float(parts[0]) / float(parts[1]))
            except (ValueError, ZeroDivisionError):
                # Best effort: keep the text as written instead of failing.
                value_normalized = value
        else:
            value_normalized = value

        return f"{value_normalized}{unit_normalized}"
    
    def extract_base_stamp_number(self, catalog_num: str) -> Optional[str]:
        """Return the digits of a catalog number, without letter prefix or suffix.

        The whole identifier has to read as letters, digits, letters (matched
        case-insensitively); any other shape gives ``None``.
        """
        found = re.match(r'[A-Z]*(\d+)[a-zA-Z]*$', catalog_num, re.IGNORECASE)
        return found.group(1) if found else None
    
    def extract_stamp_color(self, line: str, catalog_num: str) -> Optional[str]:
        """
        Extract color of stamp, avoiding colors of overprints, frames, values.
        Expert strategy: Get color between denomination and first delimiter.
        """
        # Find text segment after catalog and denomination
        # Pattern: [catalog] [denom] [COLOR SHOULD BE HERE] [, or 'op' or other delimiter]
        
        # Escape special regex chars in catalog_num
        cat_escaped = re.escape(catalog_num)
        
        # Try to find: catalog_num + any_denomination + color_text
        pattern = rf'{cat_escaped}\s+\d+(?:[/.]?\d+)?\s*(?:c|C|col|p|r|centimos?|centavos?|colones?|pesos?|reales?)\s+([^,]+?)(?:,|\bop\b|\bimperf|\bperf|\bframe\b|\bvalue\b|$)'
        match = re.search(pattern, line, re.IGNORECASE)
        
        if match:
            color_section = match.group(1).strip()
            
            # Stop at certain keywords
            stop_words = ['op', 'specimen', 'frame', 'value', 'imperf', 'perf', 
                         'inverted', 'double', 'triple', 'sunk', 'with', 'hole',
                         'margin', 'pair', '#', 'quantity', 'printed']
            
            color_parts = []
            for word in color_section.split():
                if word.lower() in stop_words:
                    break
                color_parts.append(word)
            
            if color_parts:
                color_text = ' '.join(color_parts).strip()
                # Verify it contains actual color
                if self.COLORS.search(color_text):
                    return color_text
        
        # Fallback: Get first color before comma or 'op'
        text_before_delim = re.split(r',|\bop\b', line, maxsplit=1)[0]
        
        # But after the denomination
        denom_match = re.search(r'\d+(?:[/.]?\d+)?\s*(?:c|C|col|p|r|centim|centav|colon|peso|real)', 
                               text_before_delim, re.IGNORECASE)
        if denom_match:
            text_after_denom = text_before_delim[denom_match.end():].strip()
            colors_matches = self.COLORS.findall(text_after_denom)
            if colors_matches:
                if isinstance(colors_matches[0], tuple):
                    color_parts = [p.strip() for p in colors_matches[0] if p.strip()]
                    return ' '.join(color_parts) if color_parts else None
                return colors_matches[0]
        
        return None
    
    def extract_attributes(self, text: str) -> List[str]:
        """Extract philatelic attributes including overprint colors"""
        text_lower = text.lower()
        found = []
        
        for attr in self.ATTRIBUTES:
            if attr in text_lower:
                found.append(attr)
        
        # Special: overprint colors (store as attribute, not as stamp color)
        if 'op' in text_lower or 'overprint' in text_lower:
            # Pattern: "blue op" or "op in red" or "red overprint"
            op_color_patterns = [
                r'(black|blue|red|green|violet|brown|orange|yellow)\s+op\b',
                r'\bop\b.*?\bin\s+(black|blue|red|green|violet|brown|orange|yellow)',
                r'(black|blue|red|green|violet|brown|orange|yellow)\s+overprint'
            ]
            
            for pattern in op_color_patterns:
                matches = re.finditer(pattern, text_lower)
                for match in matches:
                    color = match.group(1)
                    attr_name = f'overprint_{color}'
                    if attr_name not in found:
                        found.append(attr_name)
        
        return found
    
    def parse_catalog_entry(self, line: str) -> List[Dict]:
        """Parse line and extract catalog entries with proper denomination handling"""
        entries = []
        
        # OCR CORRECTION
        line = re.sub(r'\b(DP|PP|S)I(\d)', r'\g<1>1\2', line)
        line = re.sub(r'\b(DP|PP|S)I([a-z])', r'\g<1>1\2', line)
        line = re.sub(r'\b(S|PP|DP)l([a-z])', r'\g<1>1\2', line)
        
        # FILTERS
        if re.search(r'Decree[s]?\s*#\s*\d+', line, re.IGNORECASE):
            return entries
        if re.search(r'\(Ref\s+[A-Z]+\s+\d+', line, re.IGNORECASE):
            return entries
        if re.search(r'Accord\s+\d+', line, re.IGNORECASE):
            return entries
        
        # Filter OCR corruption (2011 repeated 3+ times)
        match = re.search(r'\b(\d+[a-zA-Z]*)\b', line)
        if match:
            first_cat = match.group(1)
            count = len(re.findall(rf'\b{re.escape(first_cat)}\b', line))
            if count >= 3:
                return entries
        
        # NEW: Handle bare numbers separated by newlines (e.g., "5\n\n\n6\n\n\n7")
        bare_numbers = re.findall(r'^(\d+)$', line, re.MULTILINE)
        if bare_numbers and len(bare_numbers) >= 2:
            for num in bare_numbers:
                if int(num) <= 500:  # Reasonable stamp number
                    entries.append({
                        'catalog_number': num,
                        'node_type': 'stamp',
                        'sub_type': 'regular_issue',
                        'base_stamp': None
                    })
            return entries
        
        # NEW: Handle concatenated M entries (M12M12a1c green...)
        # Pattern: M## followed immediately by M##a or more M entries
        m_pattern = r'\b(M\d+[a-z]?)\b'
        m_matches = list(re.finditer(m_pattern, line, re.IGNORECASE))
        if len(m_matches) >= 2:
            for m_match in m_matches:
                cat_num = m_match.group(1).upper()
                base = self.extract_base_stamp_number(cat_num)
                if cat_num not in [e['catalog_number'] for e in entries]:
                    entries.append({
                        'catalog_number': cat_num,
                        'node_type': 'specimen',
                        'sub_type': 'specimen',
                        'base_stamp': base
                    })
            # If we found M entries, return early to avoid duplicate processing
            if entries:
                return entries
        
        # NEW: Handle varieties with space after number (LaTeX cleaned: "7 a surcharge...")
        # This handles the pattern after clean_latex converts "${ }^{7} \\mathrm{a}" to "7 a"
        variety_space_pattern = r'\b(\d+)\s+([a-z])\b'
        for match in re.finditer(variety_space_pattern, line):
            num = match.group(1)
            letter = match.group(2)
            cat_num = f"{num}{letter}"
            # Skip if this looks like a dimension or other non-catalog pattern
            if int(num) > 2000:
                continue
            if cat_num not in [e['catalog_number'] for e in entries]:
                entries.append({
                    'catalog_number': cat_num,
                    'node_type': 'variety',
                    'sub_type': 'variety',
                    'base_stamp': num
                })
        
        # Die Proof: DP###
        for match in re.finditer(r'\bDP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"DP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'die_proof',
                'base_stamp': base
            })
        
        # Plate Proof: PP###
        for match in re.finditer(r'\bPP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"PP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'plate_proof',
                'base_stamp': base
            })
        
        # Color Proof: CP###
        for match in re.finditer(r'\bCP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"CP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'color_proof',
                'base_stamp': base
            })
        
        # Specimen: S### (but not in concatenated M pattern already handled)
        for match in re.finditer(r'\bS(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"S{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'specimen',
                'sub_type': 'specimen',
                'base_stamp': base
            })
        
        # Essay: E###
        for match in re.finditer(r'\bE([I\d]+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"E{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'essay',
                'sub_type': 'essay',
                'base_stamp': base
            })
        
        # Imperforate: I###
        for match in re.finditer(r'\bI(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"I{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'variety',
                'sub_type': 'imperforate',
                'base_stamp': base
            })
        
        # Overprint Proof: OP###
        for match in re.finditer(r'\bOP(\d+[a-zA-Z]*)\b', line, re.IGNORECASE):
            cat_num = f"OP{match.group(1)}"
            base = self.extract_base_stamp_number(cat_num)
            entries.append({
                'catalog_number': cat_num,
                'node_type': 'proof',
                'sub_type': 'overprint_proof',
                'base_stamp': base
            })
        
        # Base stamp with denomination
        for match in re.finditer(
            r'(?:^|\s)(\d+[a-zA-Z]*)\s+(\d+(?:[/.]?\d+|½|¼)?)\s*'
            r'(c|C|col|colón|colones|p|peso|pesos|r|real|reales|'
            r'centim|céntim|centav|céntav)\b',
            line, re.IGNORECASE
        ):
            cat_num = match.group(1).strip()
            denom_value = match.group(2)
            
            try:
                if int(denom_value.split('.')[0].split('/')[0]) > 500:
                    continue
            except:
                pass
            
            if any(e['catalog_number'] == cat_num for e in entries):
                continue
            
            if re.match(r'\d+[a-zA-Z]+$', cat_num, re.IGNORECASE):
                base = self.extract_base_stamp_number(cat_num)
                entries.append({
                    'catalog_number': cat_num,
                    'node_type': 'variety',
                    'sub_type': 'variety',
                    'base_stamp': base
                })
            else:
                entries.append({
                    'catalog_number': cat_num,
                    'node_type': 'stamp',
                    'sub_type': 'regular_issue',
                    'base_stamp': None
                })
        
        return entries
    def extract_denominations(self, line: str) -> List[Tuple[str, str]]:
        """Extract denominations with their normalized values."""
        denoms = []
        
        for match in self.DENOMINATION.finditer(line):
            value = match.group(1)
            unit = match.group(2)
            
            # ONLY skip if it's EXACTLY a 2-character catalog pattern at word boundary
            # AND appears at the START of text (catalog numbers come first)
            matched_text = match.group(0)
            
            # Pattern: "1c" or "2a" appearing isolated (not "5c" in middle of sentence)
            if re.match(r'^[1-9][a-z]$', matched_text):
                # Check if this appears at the very start or after whitespace only
                match_pos = match.start()
                prefix = line[:match_pos].strip()
                
                # If there's meaningful text before this match, it's likely a real denomination
                if prefix and not re.match(r'^\d+$', prefix):
                    # Text before suggests this is mid-sentence denomination, keep it
                    pass
                elif not prefix or re.match(r'^\d+$', prefix):
                    # This is at start or after just a number - might be catalog number
                    # Skip ONLY if followed by space and descriptive text (not another number)
                    suffix = line[match.end():match.end()+20].strip()
                    if suffix and re.match(r'^[a-zA-Z]', suffix):
                        # Followed by letters = likely catalog number like "1c double"
                        continue
            
            # Skip quantities
            try:
                num_val = float(value.replace('½', '.5').replace('¼', '.25').split('/')[0])
                if num_val > 500:
                    continue
            except:
                pass
            
            denoms.append((value, unit))
        
        return denoms
    
    def parse_element(self, element: Dict, context_before: List[str], 
                 context_after: List[str], page_number: int) -> List[PhilatelicNode]:
        """Turn one layout element into zero or more philatelic nodes.

        Heading elements (``sec``, ``sub_sec``, ``sub_sub_sec``) only update
        the parser state and give no nodes. Tables go to ``_parse_table``.
        Paragraphs are first tried as a block of bare catalog numbers, then as
        paragraph-style entries; anything else is cleaned of LaTeX and parsed
        line by line.

        Args:
            element: Dict read through the keys ``text``, ``label`` and
                ``reading_order`` (missing keys default to ``''``, ``''`` and ``0``).
            context_before: Text of preceding elements; the last three are stored.
            context_after: Text of following elements; the first one is stored.
            page_number: Page the element was found on.

        Returns:
            The nodes created from this element, possibly an empty list.
        """
        text = element.get('text', '')
        label = element.get('label', '')
        reading_order = element.get('reading_order', 0)

        if label in ('header', 'foot', 'fig'):
            return []

        # CRITICAL: <sec>, <sub_sec>, <sub_sub_sec> define hierarchy
        # Priority: sub_sub_sec > sub_sec > sec
        # NOTE(review): a deeper heading is never cleared in this method, so
        # once one has been seen a later, shallower heading no longer changes
        # current_issue - confirm that the reset happens elsewhere.
        if label == 'sec':
            self.current_sec = text
            if not getattr(self, 'current_sub_sec', None):
                self.current_issue = text
            year_match = self.YEAR.search(text)
            if year_match:
                self.current_year = year_match.group(1)
            return []

        if label == 'sub_sec':
            self.current_sub_sec = text
            if not getattr(self, 'current_sub_sub_sec', None):
                self.current_issue = text
            return []

        if label == 'sub_sub_sec':
            self.current_sub_sub_sec = text
            self.current_issue = text
            return []

        # Section markers: the first keyword contained in the text wins
        text_lower = text.lower().strip()
        for section in ['proof', 'proofs', 'regular issue', 'essays', 'essay', 'plate proof', 'die proof', 'specimen', 'regular issues', 'photographic proof', 'photographic proofs']:
            if section in text_lower:
                self.current_section = section
                break

        nodes = []

        # Handle tables
        if label == 'tab':
            nodes.extend(self._parse_table(text, page_number, reading_order, context_before))
            return nodes

        if label == 'para':
            para_lines = [l.strip() for l in text.split('\n') if l.strip()]

            # A paragraph made up only of bare numbers is a list of catalog numbers
            if len(para_lines) >= 2 and all(re.match(r'^\d+$', l) for l in para_lines):
                for num in para_lines:
                    if 1 <= int(num) <= 500:
                        nodes.append(PhilatelicNode(
                            catalog_number=num,
                            node_type='stamp',
                            sub_type='regular_issue',
                            year=self.current_year,
                            issue_name=self.current_issue,
                            page_number=page_number,
                            reading_order=reading_order,
                            raw_text=text,
                            context_before=context_before[-3:],
                            context_after=context_after[:1],
                        ))
                return nodes  # Only return for THIS paragraph

            # Otherwise look for entries that start a line with a catalog number
            if re.search(r'(?:^|\n)\s*[A-Z]*\d+[a-z]*\s+', text):
                nodes.extend(self._parse_paragraph_entries(
                    text, page_number, reading_order, context_before
                ))
                if nodes:
                    return nodes

        # Clean and process remaining content
        clean_text = self.clean_latex(text)
        lines = [line.strip() for line in clean_text.split('\n') if line.strip()]

        for line in lines:
            entries = self.parse_catalog_entry(line)
            if not entries:
                continue

            # Extract denominations
            denoms_raw = self.extract_denominations(line)
            denominations = [self.normalize_denomination(v, u) for v, u in denoms_raw]

            # One color per entry, in entry order
            colors = [self.extract_stamp_color(line, entry['catalog_number'])
                      for entry in entries]

            # Fallback: when no entry got a color, use the colors of the whole
            # line. They REPLACE the per-entry list; appending them after the
            # None placeholders would leave colors[i] empty for every entry.
            if not any(colors):
                line_colors = []
                for color_match in self.COLORS.findall(line):
                    if isinstance(color_match, tuple):
                        color_parts = [p.strip() for p in color_match if p.strip()]
                        if color_parts:
                            line_colors.append(' '.join(color_parts))
                    else:
                        line_colors.append(color_match.strip())
                if line_colors:
                    colors = line_colors

            # Extract quantity (exactly one of the two groups is set)
            quantity = None
            qty_match = self.QUANTITY.search(line)
            if qty_match:
                raw_quantity = qty_match.group(1) or qty_match.group(2)
                quantity = raw_quantity.replace(',', '')

            references = self.REFERENCE.findall(line)
            attributes = self.extract_attributes(line)

            # Create nodes
            for i, entry in enumerate(entries):
                cat_num = entry['catalog_number']
                node_type = entry['node_type']

                # A stamp already recorded in a regular-issue section is not repeated
                if node_type == 'stamp' and cat_num in self.stamps_seen:
                    continue

                node = PhilatelicNode(
                    catalog_number=cat_num,
                    node_type=node_type,
                    sub_type=entry['sub_type'],
                    denomination=denominations[0] if denominations else None,
                    color=colors[i] if i < len(colors) else (colors[0] if colors else None),
                    year=self.current_year,
                    issue_name=self.current_issue,
                    quantity=quantity if node_type == 'stamp' else None,
                    page_number=page_number,
                    reading_order=reading_order,
                    raw_text=line,
                    context_before=context_before[-3:],
                    context_after=context_after[:1],
                    attributes=attributes,
                    reference_numbers=references,
                    base_stamp=entry['base_stamp']
                )

                if node_type == 'stamp' and self.current_section and 'regular issue' in self.current_section.lower():
                    self.stamps_seen.add(cat_num)

                nodes.append(node)

        return nodes
    def _parse_paragraph_entries(self, text: str, page_number: int, 
                             reading_order: int, context: List[str]) -> List[PhilatelicNode]:
        """Parse catalog entries from paragraph format (not tables).

        Every line that starts with a catalog number opens an entry; the entry
        description runs up to the next such line or the end of the text.
        Examples: "5    1c on ½r blue", "7a   surcharge on 1A".

        The node kind follows the catalog-number prefix, using the same
        mapping as ``parse_catalog_entry``: DP/PP/CP/OP are proofs, S and M
        specimens, E essays, I imperforate varieties, a number with one
        trailing letter a variety, and anything else a regular stamp.

        Args:
            text: Raw paragraph text.
            page_number: Page the paragraph was found on.
            reading_order: Position of the paragraph on its page.
            context: Text of preceding elements; the last three are stored.

        Returns:
            One node per entry, in order of appearance.
        """
        nodes = []

        # Catalog number at the start of a line, description up to the next one
        pattern = r'(?:^|\n)\s*([A-Z]*\d+[a-z]*)\s+(.*?)(?=\n\s*[A-Z]*\d+[a-z]*\s+|\Z)'
        proof_sub_types = {'DP': 'die_proof', 'PP': 'plate_proof', 'CP': 'color_proof'}

        for match in re.finditer(pattern, text, re.MULTILINE | re.DOTALL):
            cat_num = match.group(1).strip()
            description = match.group(2).strip()
            # Leading capital letters of the catalog number ('' when there are none)
            prefix = re.match(r'[A-Z]*', cat_num).group(0)

            # Determine node type
            if cat_num.startswith(('DP', 'PP', 'CP')):
                node_type = 'proof'
                sub_type = proof_sub_types[cat_num[:2]]
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif prefix == 'OP':
                node_type = 'proof'
                sub_type = 'overprint_proof'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif cat_num.startswith(('S', 'M')):
                node_type = 'specimen'
                sub_type = 'specimen'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif prefix == 'E':
                node_type = 'essay'
                sub_type = 'essay'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif prefix == 'I':
                node_type = 'variety'
                sub_type = 'imperforate'
                base_stamp = self.extract_base_stamp_number(cat_num)
            elif re.match(r'^\d+[a-z]$', cat_num):
                node_type = 'variety'
                sub_type = 'variety'
                base_stamp = cat_num[:-1]
            else:
                node_type = 'stamp'
                sub_type = 'regular_issue'
                base_stamp = None

            # Extract denomination (first one found in the description)
            denoms_raw = self.extract_denominations(description)
            denomination = self.normalize_denomination(*denoms_raw[0]) if denoms_raw else None

            # A surcharge "Xc on Yr" overrides the plain denomination
            surcharge_match = re.search(r'(\d+)\s*c\s+on\s+([½¼\d/]+)\s*([rp])', description)
            if surcharge_match:
                new_val, old_val, old_unit = surcharge_match.groups()
                denomination = f"{new_val}c on {old_val}{old_unit}"

            nodes.append(PhilatelicNode(
                catalog_number=cat_num,
                node_type=node_type,
                sub_type=sub_type,
                denomination=denomination,
                color=self.extract_stamp_color(description, cat_num),
                year=self.current_year,
                issue_name=self.current_issue,
                quantity=None,
                page_number=page_number,
                reading_order=reading_order,
                raw_text=description,
                context_before=context[-3:],
                attributes=self.extract_attributes(description),
                base_stamp=base_stamp
            ))

        return nodes
    
    def _parse_table(self, html_text: str, page_number: int, 
                reading_order: int, context: List[str]) -> List[PhilatelicNode]:
        """Turn the rows of an HTML table into PhilatelicNode objects.

        The markup is parsed with BeautifulSoup; the text of every ``<td>`` in each
        ``<tr>`` becomes one cell, and rows whose cells are all empty are dropped.
        Each remaining row is classified from its first cell:

        * Case 1 - two cells, first cell is digits followed by letters: a variety.
        * Case 2 - two cells, first cell is digits only: a stamp, unless the
          description contains a variety keyword (then the row is skipped, except
          for catalog number "1").
        * Case 3 - three or more cells, first cell is digits with optional letters:
          a variety (possibly with extra variety codes in the second cell) or a
          stamp (possibly with a variety code in the second cell).
        * Case 4 - anything else is delegated to ``self.parse_catalog_entry``.

        For every row that yields entries, surcharge, denomination, color,
        attributes and quantity are extracted from the joined row text and one
        node is created per entry.

        Args:
            html_text: HTML markup containing the table rows.
            page_number: Stored on every created node.
            reading_order: Stored on every created node.
            context: Preceding text snippets; the last three are stored on each
                node as ``context_before``.

        Returns:
            The nodes created from this table, in row order.

        Side effects:
            Prints ``DEBUG:`` trace lines, reads ``self.stamps_seen`` to skip
            stamps already recorded, and adds stamp numbers to it when
            ``self.current_section`` contains "regular".
        """
        soup = BeautifulSoup(html_text, 'html.parser')
        rows = []
        
        # Collect the stripped text of each <td>; rows with no non-empty cell are ignored.
        for tr in soup.find_all('tr'):
            cells = [td.get_text(strip=True) for td in tr.find_all('td')]
            if cells and not all(c == '' for c in cells):
                rows.append(cells)
        
        nodes = []
        # Denomination carried from the previous row, used when a row has none of its own.
        last_denomination = None
        
        for row in rows:
            # Catalog numbers already emitted from earlier rows of this table.
            created_catalog_numbers = set(node.catalog_number for node in nodes)
            
            if not row or len(row) < 2:
                continue
            
            print(f"DEBUG: Processing table row with {len(row)} cells: {row}")
            
            # Skip headers
            # A row counts as a header when it has at most 5 cells and every
            # non-empty cell contains one of the keywords below.
            header_keywords = ['perf', 'imperf', 'date', 'order', 'plate', 'essays']
            if len(row) <= 5 and all(any(kw in cell.lower() for kw in header_keywords) for cell in row if cell):
                print(f"DEBUG: Skipping header row")
                continue
            
            first_cell = row[0].strip()
            entries = []
            
            # Case 1: 2-cell variety rows
            if len(row) == 2 and re.match(r'^\d+[a-zA-Z]+$', first_cell):
                base = self.extract_base_stamp_number(first_cell)
                entries = [{
                    'catalog_number': first_cell,
                    'node_type': 'variety',
                    'sub_type': 'variety',
                    'base_stamp': base
                }]
                row_text = ' '.join(row)
                print(f"DEBUG: 2-cell variety row: {first_cell} -> base {base}")
                # Clearing this means the row cannot inherit a denomination from the previous row.
                last_denomination = None
            
            # Case 2: 2-cell bare number rows
            elif len(row) == 2 and re.match(r'^\d+$', first_cell):
                desc = row[1].lower()
                variety_keywords = ['perf', 'diagonal', 'horizontal', 'cracked', 'impression', 'pair', 'inverted']
                
                # Special case: catalog #1 is likely a real stamp even with variety keywords
                if first_cell == '1' and any(kw in desc for kw in variety_keywords):
                    entries = [{
                        'catalog_number': first_cell,
                        'node_type': 'stamp',
                        'sub_type': 'regular_issue',
                        'base_stamp': None
                    }]
                    row_text = ' '.join(row)
                    print(f"DEBUG: Special case - stamp #1")
                    last_denomination = None
                
                # A bare number whose description reads like a variety is dropped entirely.
                elif any(kw in desc for kw in variety_keywords):
                    print(f"DEBUG: Row {first_cell} appears to be variety with missing letter, skipping")
                    continue
                else:
                    entries = [{
                        'catalog_number': first_cell,
                        'node_type': 'stamp',
                        'sub_type': 'regular_issue',
                        'base_stamp': None
                    }]
                    row_text = ' '.join(row)
                    print(f"DEBUG: 2-cell stamp row: {first_cell}")
                    last_denomination = None
            
            # Case 3: Multi-cell rows (3+)
            elif len(row) >= 3 and re.match(r'^\d+[a-zA-Z]*$', first_cell):
                # rest_of_row excludes the catalog number; row_text_full is what the
                # shared extraction step below works on.
                rest_of_row = ' '.join(row[1:])
                row_text_full = ' '.join(row)
                
                print(f"DEBUG: Found catalog number in first cell: {first_cell}")
                
                # Subcase 3a: It's a variety
                if re.match(r'^\d+[a-zA-Z]+$', first_cell):
                    base = self.extract_base_stamp_number(first_cell)
                    entries = [{
                        'catalog_number': first_cell,
                        'node_type': 'variety',
                        'sub_type': 'variety',
                        'base_stamp': base
                    }]
                    print(f"DEBUG: Identified as variety: {first_cell} -> base {base}")
                    
                    # Check column 2 for additional varieties
                    # (this guard is always true here: the enclosing branch requires len(row) >= 3)
                    if len(row) >= 2:
                        second_cell = row[1].strip()
                        # Fixed version - only single letters or specific patterns:
                        additional_varieties = []
                        # Pattern 1: Single lowercase letter (a, b, c)
                        for match in re.finditer(r'\b([a-z])\b', second_cell):
                            additional_varieties.append(match.group(1))
                        # Pattern 2: Uppercase + lowercase (like "Aa" from "1Aa")
                        for match in re.finditer(r'([A-Z][a-z])', second_cell):
                            additional_varieties.append(match.group(1))
                        
                        # Filter out if it's part of a color word
                        # NOTE(review): the test is "some color word starts with the code", so
                        # standalone letters such as 'b' or 'r' are dropped too (blue, red) -
                        # confirm that is intended.
                        color_words = ['dark', 'light', 'deep', 'pale', 'bright', 'red', 'green', 'blue', 
                                    'violet', 'yellow', 'orange', 'brown', 'black', 'white', 'gray', 'rose']
                        additional_varieties = [v for v in additional_varieties 
                                            if not any(word.startswith(v.lower()) for word in color_words)]
                        
                        for var_code in additional_varieties:
                            # Extra codes are attached to the same base number as the first cell.
                            full_cat = f"{base}{var_code}"
                            # Skip duplicates
                            if full_cat not in [e['catalog_number'] for e in entries] and full_cat != first_cell and full_cat not in created_catalog_numbers:
                                entries.append({
                                    'catalog_number': full_cat,
                                    'node_type': 'variety',
                                    'sub_type': 'variety',
                                    'base_stamp': base
                                })
                                print(f"DEBUG: Additional variety: {full_cat}")
                
                # Subcase 3b: It's a stamp
                else:
                    denoms_raw = self.extract_denominations(rest_of_row)
                    
                    if denoms_raw:
                        # Found denomination in rest of row
                        entries = [{
                            'catalog_number': first_cell,
                            'node_type': 'stamp',
                            'sub_type': 'regular_issue',
                            'base_stamp': None
                        }]
                        print(f"DEBUG: Stamp with denomination in row: {first_cell}")
                    else:
                        # No denomination found - use heuristics
                        second_cell = row[1].strip() if len(row) > 1 else ""
                        third_col_text = ' '.join(row[2:]) if len(row) >= 3 else ""
                        
                        # Heuristic 1: Column 2 is a variety code
                        if re.match(r'^\d+[a-z]$', second_cell):
                            denoms_in_desc = self.extract_denominations(third_col_text)
                            last_col = row[-1].strip() if row else ""
                            
                            if denoms_in_desc:
                                entries = [{
                                    'catalog_number': first_cell,
                                    'node_type': 'stamp',
                                    'sub_type': 'regular_issue',
                                    'base_stamp': None
                                }]
                                rest_of_row = third_col_text
                                print(f"DEBUG: Stamp {first_cell} - denom in column 3")
                            # A last cell shaped like "12,345" / "123,456" is taken as a print
                            # quantity, which is treated as evidence that the row is a stamp.
                            elif re.match(r'^\d{2,3},\d{3}$', last_col):
                                entries = [{
                                    'catalog_number': first_cell,
                                    'node_type': 'stamp',
                                    'sub_type': 'regular_issue',
                                    'base_stamp': None
                                }]
                                rest_of_row = third_col_text
                                print(f"DEBUG: Stamp {first_cell} inferred from quantity")
                                last_denomination = None
                            else:
                                print(f"DEBUG: No denomination for {first_cell}, skipping")
                                continue
                            
                            # Add variety from column 2
                            # The base number is the variety code minus its trailing letter.
                            base_of_variety = second_cell[:-1]
                            entries.append({
                                'catalog_number': second_cell,
                                'node_type': 'variety',
                                'sub_type': 'variety',
                                'base_stamp': base_of_variety
                            })
                            print(f"DEBUG: Added variety {second_cell}")
                        else:
                            print(f"DEBUG: No denomination for {first_cell}, skipping")
                            continue
                    
                    # Check for variety in same row (after stamp created)
                    # The membership test keeps a variety added by Heuristic 1 from being added twice.
                    if entries and entries[0]['node_type'] == 'stamp' and len(row) >= 2:
                        second_cell = row[1].strip()
                        if re.match(r'^\d+[a-z]$', second_cell) and second_cell not in [e['catalog_number'] for e in entries]:
                            base_of_variety = second_cell[:-1]
                            entries.append({
                                'catalog_number': second_cell,
                                'node_type': 'variety',
                                'sub_type': 'variety',
                                'base_stamp': base_of_variety
                            })
                            print(f"DEBUG: Variety {second_cell} in same row")
                
                row_text = row_text_full
            
            # Case 4: Normal parsing
            else:
                row_text = ' '.join(row)
                print(f"DEBUG: Normal parsing")
                entries = self.parse_catalog_entry(row_text)
            
            print(f"DEBUG: Found {len(entries)} entries: {[e['catalog_number'] for e in entries]}")
            
            # Shared extraction step: everything below works on the joined row text
            # and applies to all entries found in this row.
            if entries:
                surcharge_match = self.SURCHARGE.search(row_text)
                surcharge_info = None
                if surcharge_match:
                    new_value = surcharge_match.group(1)
                    old_value = surcharge_match.group(2)
                    old_unit = surcharge_match.group(3)
                    surcharge_info = f"{new_value}c on {old_value} {old_unit}"
                
                denoms_raw = self.extract_denominations(row_text)
                denominations = [self.normalize_denomination(v, u) for v, u in denoms_raw]
                print(f"DEBUG: Denominations: {denominations}")
                
                # Inherit the previous row's denomination when this row has none;
                # otherwise remember this row's first denomination for later rows.
                if not denominations and last_denomination:
                    denominations = [last_denomination]
                    print(f"DEBUG: Using last denomination: {last_denomination}")
                elif denominations:
                    last_denomination = denominations[0]
                
                # Last resort: one or more E-prefixed numbers followed by a bare integer;
                # that integer is normalized with unit 'c'.
                if not denominations:
                    special_match = re.search(
                        r'([E]\d+[a-zA-Z]*(?:\s+[E]\d+[a-zA-Z]*)*)\s+(\d+)\s+',
                        row_text, re.IGNORECASE
                    )
                    if special_match:
                        denom_value = special_match.group(2)
                        denominations = [self.normalize_denomination(denom_value, 'c')]
                        last_denomination = denominations[0]
                
                # One color lookup per entry, index-aligned with `entries`.
                colors = []
                for entry in entries:
                    color = self.extract_stamp_color(row_text, entry['catalog_number'])
                    colors.append(color)
                
                attributes = self.extract_attributes(row_text)
                
                quantity = None
                qty_match = self.QUANTITY.search(row_text)
                if qty_match:
                    # Whichever of the two capture groups matched; commas are removed.
                    raw_quantity = qty_match.group(1) or qty_match.group(2)
                    quantity = raw_quantity.replace(',', '')
                    print(f"DEBUG: Quantity: {quantity}")
                
                for i, entry in enumerate(entries):
                    cat_num = entry['catalog_number']
                    node_type = entry['node_type']
                    sub_type = entry['sub_type']
                    base_stamp = entry['base_stamp']
                    
                    # Copy so the surcharge tag is not appended to the shared list.
                    final_attributes = attributes.copy()
                    if surcharge_info:
                        final_attributes.append(f'surcharge_{surcharge_info}')
                    
                    if node_type == 'stamp' and cat_num in self.stamps_seen:
                        print(f"DEBUG: Skipping duplicate stamp {cat_num}")
                        continue
                    
                    # Every entry of the row gets the row's first denomination;
                    # quantity is only recorded on stamp nodes.
                    node = PhilatelicNode(
                        catalog_number=cat_num,
                        node_type=node_type,
                        sub_type=sub_type,
                        denomination=denominations[0] if denominations else None,
                        color=colors[i] if i < len(colors) else (colors[0] if colors else None),
                        year=self.current_year,
                        issue_name=self.current_issue,
                        quantity=quantity if node_type == 'stamp' else None,
                        page_number=page_number,
                        reading_order=reading_order,
                        raw_text=row_text,
                        context_before=context[-3:],
                        attributes=final_attributes,
                        base_stamp=base_stamp
                    )
                    nodes.append(node)
                    print(f"DEBUG: Created {cat_num} - {node.denomination} {node.color}")
                    
                    # Only stamps found while the current section name contains "regular"
                    # are remembered for duplicate suppression.
                    if node_type == 'stamp' and self.current_section and 'regular' in self.current_section.lower():
                        self.stamps_seen.add(cat_num)
                        print(f"DEBUG: Added {cat_num} to stamps_seen")
            
        return nodes
class PhilatelicExtractor:
    """Main extractor with expert Costa Rica knowledge.

    Feeds the elements of each page to a ``CostaRicaCatalogParser`` in reading
    order, collects the nodes it returns, and links nodes to their base stamps.

    Input JSON may be a single page object (a dict with ``page_number``), a
    dict with a ``pages`` list, or a bare list of page objects.
    """

    # Element labels whose text is never handed to the parser as context.
    _CONTEXT_SKIP_LABELS = ('header', 'foot', 'fig')

    def __init__(self, context_window: Tuple[int, int] = (-3, 1)):
        """Create the extractor and its parser.

        Args:
            context_window: ``(before, after)`` offsets relative to the current
                element index. The default covers up to three elements before
                and one element after.
        """
        self.context_window = context_window
        self.parser = CostaRicaCatalogParser()

    @staticmethod
    def _load_pages(input_path: str) -> Optional[List[Dict]]:
        """Read ``input_path`` and return its pages, or None if the layout is unknown."""
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, dict):
            # A single page object.
            if 'page_number' in data:
                return [data]
            # A document wrapper: {"pages": [...]}.
            if isinstance(data.get('pages'), list):
                return data['pages']
            return None
        if isinstance(data, list):
            return data
        return None

    def _reset_parser_state(self) -> None:
        """Clear the parser state that is otherwise carried from page to page."""
        self.parser.current_issue = None
        self.parser.current_section = None
        self.parser.current_year = None
        self.parser.stamps_seen = set()

    @staticmethod
    def _bump(counter: Dict, key) -> None:
        """Increment ``counter[key]``, starting from zero."""
        counter[key] = counter.get(key, 0) + 1

    def get_context(self, elements: List[Dict], index: int) -> Tuple[List[str], List[str]]:
        """Get context around element.

        Args:
            elements: Page elements (dicts with optional ``text`` and ``label``).
            index: Position of the element whose context is wanted.

        Returns:
            ``(context_before, context_after)``: non-empty texts of neighbouring
            elements inside the context window, excluding header/foot/fig elements.
        """
        start = max(0, index + self.context_window[0])
        end_after = min(len(elements), index + self.context_window[1] + 1)

        def usable_texts(positions) -> List[str]:
            texts = []
            for pos in positions:
                text = elements[pos].get('text', '')
                if text and elements[pos].get('label') not in self._CONTEXT_SKIP_LABELS:
                    texts.append(text)
            return texts

        return usable_texts(range(start, index)), usable_texts(range(index + 1, end_after))

    def process_page(self, page_data: Dict) -> List["PhilatelicNode"]:
        """Process single page.

        Elements are sorted by ``reading_order`` and parsed one by one.

        Args:
            page_data: Page object with ``elements`` and ``page_number``.

        Returns:
            All nodes the parser produced for this page.
        """
        elements = page_data.get('elements', [])
        page_number = page_data.get('page_number', 0)

        elements = sorted(elements, key=lambda x: x.get('reading_order', 0))

        # Section tracking restarts on every page. current_issue, current_year
        # and stamps_seen are deliberately left alone: issues can span pages
        # and duplicates must be tracked across them.
        self.parser.current_section = None
        self.parser.current_sub_sec = None
        self.parser.current_sub_sub_sec = None

        all_nodes = []

        for i, element in enumerate(elements):
            context_before, context_after = self.get_context(elements, i)
            nodes = self.parser.parse_element(element, context_before, context_after, page_number)
            all_nodes.extend(nodes)

        return all_nodes

    def build_relationships(self, nodes: List["PhilatelicNode"]) -> List[Dict]:
        """Build relationships between nodes.

        A relationship is emitted for every node whose ``base_stamp`` matches the
        catalog number of some node in ``nodes``.

        Returns:
            A list of dicts with keys ``from``, ``to``, ``type``, ``sub_type``
            and ``description``.
        """
        relationships = []

        # Index nodes by catalog number (the last node wins on repeated numbers).
        by_catalog = {n.catalog_number: n for n in nodes}

        for node in nodes:
            if node.base_stamp and node.base_stamp in by_catalog:
                base = by_catalog[node.base_stamp]
                relationships.append({
                    'from': node.catalog_number,
                    'to': base.catalog_number,
                    'type': f'{node.node_type.upper()}_OF',
                    'sub_type': node.sub_type,
                    'description': f"{node.catalog_number} ({node.sub_type}) is a {node.node_type} of stamp {base.catalog_number}"
                })

        return relationships

    def process_sample_pages(self, input_path: str, start_page: int = 30, num_pages: int = 2) -> Dict:
        """Process sample pages.

        Args:
            input_path: JSON file to read.
            start_page: First page number to include.
            num_pages: Number of consecutive page numbers to include.

        Returns:
            Dict with ``total_nodes``, ``nodes_by_type``, ``all_nodes``,
            ``relationships`` and ``relationship_count``.
        """
        pages = self._load_pages(input_path) or []

        pages = [p for p in pages if p.get('page_number', 0) >= start_page
                 and p.get('page_number', 0) < start_page + num_pages]

        # Start from a clean parser so repeated calls give the same result.
        self._reset_parser_state()

        all_nodes = []
        for page in pages:
            all_nodes.extend(self.process_page(page))

        # Build relationships
        relationships = self.build_relationships(all_nodes)

        result = {
            'total_nodes': len(all_nodes),
            'nodes_by_type': {},
            'all_nodes': [node.to_dict() for node in all_nodes],
            'relationships': relationships,
            'relationship_count': len(relationships)
        }

        for node in all_nodes:
            result['nodes_by_type'].setdefault(node.node_type, []).append(node.to_dict())

        return result

    def inspect_json_structure(self, input_path: str, max_pages: int = 3):
        """Print a short summary of the first ``max_pages`` pages of a JSON file."""
        pages = self._load_pages(input_path)
        if pages is None:
            print("Unexpected JSON structure")
            return

        print(f"Total pages: {len(pages)}")
        print(f"\nInspecting first {min(max_pages, len(pages))} pages:\n")

        for page in pages[:max_pages]:
            print(f"Page {page.get('page_number')}:")
            elements = page.get('elements', [])
            print(f"  Elements: {len(elements)}")

            types = {}
            for el in elements:
                self._bump(types, el.get('label', 'unknown'))

            print(f"  Types: {types}")

            # Show <sec> elements (issue names)
            sec_elements = [el for el in elements if el.get('label') == 'sec']
            if sec_elements:
                print(f"  Issues found:")
                for sec in sec_elements:
                    print(f"    - {sec.get('text', 'N/A')}")
            print()

    def process_file(self, input_path: str, output_path: str):
        """Process entire file and write the result to ``output_path`` as JSON.

        Args:
            input_path: JSON file to read.
            output_path: Destination for the JSON report (UTF-8, indented).

        Returns:
            The report dict that was written: nodes, relationships and summary
            counts by type, page, issue and denomination.
        """
        pages = self._load_pages(input_path) or []

        all_nodes = []
        # Reset parser state at start
        self._reset_parser_state()

        for page in pages:
            all_nodes.extend(self.process_page(page))

        # Build relationships
        relationships = self.build_relationships(all_nodes)

        summary = {
            'by_type': {},
            'by_page': {},
            'by_issue': {},
            'by_denomination': {},
            'relationship_count': len(relationships)
        }
        output = {
            'total_nodes': len(all_nodes),
            'pages_processed': len(pages),
            'nodes': [node.to_dict() for node in all_nodes],
            'relationships': relationships,
            'summary': summary
        }

        # Generate statistics
        for node in all_nodes:
            self._bump(summary['by_type'], node.node_type)
            # Page numbers are stringified so the keys are uniform in the JSON output.
            self._bump(summary['by_page'], str(node.page_number))
            if node.issue_name:
                self._bump(summary['by_issue'], node.issue_name)
            if node.denomination:
                self._bump(summary['by_denomination'], node.denomination)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        return output

In [None]:
def test_denomination_extraction():
    """Check denomination extraction and normalization on sample catalog lines.

    Each case is run through ``extract_denominations`` and the first hit is
    normalized and compared with the expected string. A report is printed.

    Returns:
        ``(passed, failed)`` case counts.
    """
    parser = CostaRicaCatalogParser()

    # (catalog line, expected normalized denomination, note shown in the report)
    cases = [
        # Centavos / centimos
        ('99 15c black', '15c', 'Standard centavos'),
        ('DP100 5c violet', '5c', 'Small value'),
        ('PP101 100c orange', '100c', '100 centavos (not Colones)'),
        ('S102 15centimos black', '15c', 'Full word "centimos"'),
        # Colones
        ('200 1col blue', '1col', 'One colón'),
        ('201 5C green', '5col', 'Capital C = Colones'),
        ('202 2colones red', '2col', 'Full word "colones"'),
        # Pesos
        ('50 1p black', '1p', '1 peso'),
        ('51 10pesos violet', '10p', 'Full word "pesos"'),
        # Reales, including fractions
        ('10 ½r black', '0.5r', 'Half real'),
        ('11 1/2r violet', '0.5r', 'Fraction 1/2 real'),
        ('12 2r green', '2r', '2 reales'),
        ('13 ¼r orange', '0.25r', 'Quarter real'),
        # Edge cases
        ('99 15c deep violet 1,000,000', '15c', 'Implicit "c", quantity present'),
        ('DP99 15c black #34009', '15c', 'With reference number'),
    ]

    rule = "="*80
    print(rule)
    print("TEST 1: DENOMINATION EXTRACTION")
    print(rule)
    print()

    outcomes = []
    for number, (text, expected, note) in enumerate(cases, 1):
        print(f"{number}. {text}")
        print(f"   Note: {note}")

        found = parser.extract_denominations(text)
        if not found:
            print(f"   ❌ FAIL - No denomination extracted")
            outcomes.append(False)
        else:
            # Only the first denomination found in the line is checked.
            first = found[0]
            normalized = parser.normalize_denomination(first[0], first[1])
            if normalized == expected:
                print(f"   ✅ PASS - Extracted: {normalized}")
                outcomes.append(True)
            else:
                print(f"   ❌ FAIL - Expected: {expected}, Got: {normalized}")
                outcomes.append(False)
        print()

    passed = outcomes.count(True)
    failed = outcomes.count(False)

    print(f"Results: {passed}/{len(cases)} passed\n")
    return passed, failed

def test_issue_name_tracking():
    """Check that nodes pick up the issue name of the nearest preceding <sec>.

    A synthetic page with two issues is processed and the ``issue_name`` of
    four expected catalog numbers is compared. A report is printed.

    Returns:
        ``(passed, failed)`` counts over the expected catalog numbers.
    """
    rule = "="*80
    print(rule)
    print("TEST 2: ISSUE NAME TRACKING (from <sec> elements)")
    print(rule)
    print()

    # (label, text) in reading order; reading_order is assigned 1..N below.
    sequence = [
        ('sec', 'Simon Bolivar Birthday issue'),
        ('para', 'July 24, 1921. Decree #18'),
        ('para', 'Proof'),
        ('para', 'DP99 15c black'),
        ('para', 'Regular issue'),
        ('para', '99 15c deep violet'),
        ('sec', 'Central America Independence issue'),
        ('para', 'September 15, 1921'),
        ('para', 'PP100 5c violet'),
        ('para', '100 5c violet'),
    ]
    test_page = {
        'page_number': 30,
        'elements': [
            {'label': label, 'text': text, 'reading_order': order}
            for order, (label, text) in enumerate(sequence, 1)
        ]
    }

    nodes = PhilatelicExtractor().process_page(test_page)

    print(f"Extracted {len(nodes)} nodes\n")

    bolivar = 'Simon Bolivar Birthday issue'
    independence = 'Central America Independence issue'
    expected_issues = {
        'DP99': bolivar,
        '99': bolivar,
        'PP100': independence,
        '100': independence
    }

    passed = failed = 0

    for cat_num in expected_issues:
        wanted = expected_issues[cat_num]
        # First node carrying this catalog number, if any.
        node = next((n for n in nodes if n.catalog_number == cat_num), None)

        if node is None:
            print(f"❌ {cat_num}: Node not found")
            failed += 1
        elif node.issue_name == wanted:
            print(f"✅ {cat_num}: '{node.issue_name}'")
            passed += 1
        else:
            print(f"❌ {cat_num}: Expected '{wanted}', got '{node.issue_name}'")
            failed += 1

    print(f"\nResults: {passed}/{len(expected_issues)} passed\n")
    return passed, failed

def test_quantity_vs_denomination():
    """Check that print quantities and denominations are told apart.

    For each sample line the first normalized denomination and the raw
    ``QUANTITY`` match are compared with the expected values. A report is printed.

    Returns:
        ``(passed, failed)`` case counts.
    """
    rule = "="*80
    print(rule)
    print("TEST 3: QUANTITY vs DENOMINATION DISAMBIGUATION")
    print(rule)
    print()

    # (catalog line, expected denomination, expected raw quantity, note)
    cases = [
        ('99 15c deep violet 1,000,000', '15c', '1,000,000', 'Quantity has comma'),
        ('DP99 15c black #34009', '15c', None, 'No quantity'),
        ('100 5c violet printed 500,000', '5c', '500,000', 'Quantity with "printed" keyword'),
    ]

    parser = CostaRicaCatalogParser()
    passed = failed = 0

    for number, (text, want_denom, want_qty, note) in enumerate(cases, 1):
        print(f"{number}. {text}")
        print(f"   Note: {note}")

        # Denomination: first hit only, normalized.
        found = parser.extract_denominations(text)
        if found:
            denom = parser.normalize_denomination(found[0][0], found[0][1])
        else:
            denom = None

        # Quantity: whichever capture group matched, left unmodified.
        hit = parser.QUANTITY.search(text)
        if hit:
            qty = hit.group(1) or hit.group(2)
        else:
            qty = None

        denom_ok = denom == want_denom
        qty_ok = qty == want_qty

        if not (denom_ok and qty_ok):
            print(f"   ❌ FAIL")
            if not denom_ok:
                print(f"      Denom: Expected {want_denom}, Got {denom}")
            if not qty_ok:
                print(f"      Qty: Expected {want_qty}, Got {qty}")
            failed += 1
        else:
            print(f"   ✅ PASS - Denom: {denom}, Qty: {qty}")
            passed += 1
        print()

    print(f"Results: {passed}/{len(cases)} passed\n")
    return passed, failed

def test_color_extraction_expert():
    """Check stamp-color extraction against overprint and paper colors.

    Each case compares ``extract_stamp_color`` with the expected color and,
    where given, requires a specific entry in ``extract_attributes``. A report
    is printed.

    Returns:
        ``(passed, failed)`` case counts.
    """
    rule = "="*80
    print(rule)
    print("TEST 4: EXPERT COLOR EXTRACTION")
    print(rule)
    print()

    parser = CostaRicaCatalogParser()

    # (catalog line, catalog number, expected color, required attribute or None, note)
    cases = [
        ('S99 15c violet, op "specimen" in red with hole', 'S99', 'violet',
         'overprint_red', 'Red is overprint color, not stamp color'),
        ('106 1c brown, blue op inverted op', '106', 'brown',
         'overprint_blue', 'Blue is overprint, brown is stamp'),
        ('PP99 15centimos light gray violet, imperf', 'PP99', 'light gray violet',
         None, 'Compound color with full word denomination'),
        ('50 1p black on green', '50', 'black',
         None, 'Bicolor: black on green paper'),
    ]

    passed = failed = 0

    for number, (text, catalog, want_color, want_attr, note) in enumerate(cases, 1):
        print(f"{number}. {text}")
        print(f"   Note: {note}")

        color = parser.extract_stamp_color(text, catalog)
        attrs = parser.extract_attributes(text)

        checks_attr = want_attr is not None
        color_ok = color == want_color
        # Cases without a required attribute pass the attribute check automatically.
        attr_ok = (want_attr in attrs) if checks_attr else True

        if not (color_ok and attr_ok):
            print(f"   ❌ FAIL")
            if not color_ok:
                print(f"      Expected color: {want_color}, Got: {color}")
            if not attr_ok:
                print(f"      Missing attribute: {want_attr}")
            failed += 1
        else:
            print(f"   ✅ PASS - Color: {color}")
            if checks_attr:
                print(f"      Attribute found: {want_attr}")
            passed += 1
        print()

    print(f"Results: {passed}/{len(cases)} passed\n")
    return passed, failed

def test_full_integration():
    """Run the extractor end to end and print a validation report.

    Loads the recognition JSON when it exists (otherwise a small in-memory
    sample page), feeds every selected page to
    ``PhilatelicExtractor.process_page``, prints node counts grouped by issue
    and by type, evaluates nine validation checks, then prints every node per
    issue, up to five relationships per relationship type, and finally any
    node that has no denomination.

    NOTE(review): the banner, the fallback sample and the expected node range
    all refer to pages 30-31, but the file branch keeps pages whose
    ``page_number`` is 10 or 11 -- confirm which pages are intended.

    Returns:
        tuple: ``(passed, failed)`` counts over the validation checks.
    """
    rule = "=" * 80

    print(rule)
    print("TEST 5: FULL INTEGRATION (Real Pages 30-31 from file)")
    print(rule)
    print()

    input_path = "./results/recognition_json/Mena 2018 CRPC .json"

    if os.path.exists(input_path):
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # The export is either {"pages": [...]} or a bare list of pages.
        if isinstance(data, dict) and 'pages' in data:
            all_pages = data['pages']
        elif isinstance(data, list):
            all_pages = data
        else:
            print("⚠️  Unexpected JSON structure")
            all_pages = []

        # Keep only the pages under test (see the NOTE in the docstring).
        sample_data = [page for page in all_pages if page.get('page_number') in [10, 11]]
        print(f"Loaded {len(sample_data)} pages from file\n")
        print(f"Total pages in catalog: {len(all_pages)}\n")
    else:
        print(f"⚠️  File not found: {input_path}")
        print("   Using minimal sample data instead\n")

        # Minimal stand-in page: one section header and two catalog entries.
        sample_data = [{
            'page_number': 30,
            'elements': [
                {"label": "sec", "text": "Simon Bolivar Birthday issue", "reading_order": 8},
                {"label": "para", "text": "DP99 15c black", "reading_order": 11},
                {"label": "para", "text": "99 15c deep violet", "reading_order": 17}
            ]
        }]

    extractor = PhilatelicExtractor()

    # Flatten the per-page results, keeping page order.
    all_nodes = [node for page in sample_data for node in extractor.process_page(page)]
    node_total = len(all_nodes)

    print(f"Total nodes extracted: {node_total}")
    print("Expected for pages 30-31: 58-70 nodes\n")

    # Bucket nodes by issue name; nodes without one share the 'No issue' bucket.
    by_issue = {}
    for node in all_nodes:
        by_issue.setdefault(node.issue_name or 'No issue', []).append(node)

    print("Nodes by Issue:")
    for issue, nodes in by_issue.items():
        print(f"  {issue}: {len(nodes)} nodes")
    print()

    # Bucket nodes by node type.
    by_type = {}
    for node in all_nodes:
        by_type.setdefault(node.node_type, []).append(node)

    print("Nodes by Type:")
    for node_type, nodes in sorted(by_type.items()):
        print(f"  {node_type:15s}: {len(nodes)} nodes")
    print()

    # ---- Validation checks -------------------------------------------------
    checks = []

    def record(name, ok, detail):
        """Append one check result in the shape the report loop expects."""
        checks.append({'name': name, 'passed': ok, 'detail': detail})

    # Check 1: every node carries an issue name.
    issue_hits = len([n for n in all_nodes if n.issue_name])
    record('All nodes have issue_name', issue_hits == node_total, f"{issue_hits}/{node_total}")

    # Check 2: every node carries a denomination.
    denom_hits = len([n for n in all_nodes if n.denomination])
    record('All nodes have denomination', denom_hits == node_total, f"{denom_hits}/{node_total}")

    # Check 3: at least one distinct issue name was seen (a single issue is
    # enough to pass, despite the check's name).
    unique_issues = {n.issue_name for n in all_nodes if n.issue_name}
    record('Multiple issues detected', len(unique_issues) >= 1,
           f"{len(unique_issues)} unique issues found")

    # Check 4: informational only -- always passes, reports the count.
    nodes_with_op_attrs = [
        n for n in all_nodes
        if any('overprint' in attr.lower() for attr in n.attributes)
    ]
    record('Overprint colors in attributes', True,
           f"{len(nodes_with_op_attrs)} nodes with overprint attributes")

    # Check 5: at least one stamp that does not point at another stamp.
    base_stamps = [n for n in all_nodes if n.node_type == 'stamp' and not n.base_stamp]
    record('Base stamps detected', len(base_stamps) > 0, f"{len(base_stamps)} base stamps")

    # Check 6: among proofs/varieties/specimens, at least one has a base_stamp;
    # passes trivially when there are no such nodes.
    derived_nodes = [n for n in all_nodes if n.node_type in ['proof', 'variety', 'specimen']]
    with_base = [n for n in derived_nodes if n.base_stamp]
    if derived_nodes:
        record('Derived nodes have base_stamp', len(with_base) > 0,
               f"{len(with_base)}/{len(derived_nodes)} have base reference")
    else:
        record('Derived nodes have base_stamp', True, "N/A")

    # Check 7: the extractor produced at least one relationship.
    relationships = extractor.build_relationships(all_nodes)
    record('Relationships built', len(relationships) > 0, f"{len(relationships)} relationships")

    # Check 8: at least one node has a year.
    year_hits = len([n for n in all_nodes if n.year])
    record('Year information captured', year_hits > 0, f"{year_hits} nodes with year info")

    # Check 9: total node count falls inside the tolerated 50-80 window.
    record('Node count within expected range', 50 <= node_total <= 80,
           f"{node_total} nodes (expected 58-70)")

    # ---- Validation report -------------------------------------------------
    print(rule)
    print("VALIDATION RESULTS")
    print(rule)
    print()

    passed = len([check for check in checks if check['passed']])

    for check in checks:
        status = '✅' if check['passed'] else '❌'
        print(f"{status} {check['name']}: {check['detail']}")

    print(f"\nValidation: {passed}/{len(checks)} checks passed\n")

    # ---- Per-issue listing (every node is printed, not just the first few) --
    print(rule)
    print("DETAILED BREAKDOWN BY ISSUE")
    print(rule)
    print()

    for issue_name, nodes in sorted(by_issue.items()):
        print(f"\n{issue_name}:")
        print(f"  Total: {len(nodes)} nodes")

        # Tally node types inside this issue, in first-seen order.
        types_in_issue = {}
        for member in nodes:
            types_in_issue[member.node_type] = types_in_issue.get(member.node_type, 0) + 1

        print(f"  Types: {types_in_issue}\n")

        for i, node in enumerate(nodes, 1):
            base_str = f" → {node.base_stamp}" if node.base_stamp else ""
            print(f"  {i}. {node.catalog_number} ({node.node_type}/{node.sub_type}){base_str}")

            qty_str = f", qty: {node.quantity}" if node.quantity else ""
            attrs_str = f", attrs: {node.attributes or '[]'}"
            refs_str = f", refs: {node.reference_numbers[:2]}" if node.reference_numbers else ""
            print(f"     {node.denomination} {node.color or 'N/A'}{qty_str}{attrs_str}{refs_str}")

    # ---- Relationships (first five per type) --------------------------------
    if relationships:
        print("\n" + rule)
        print("RELATIONSHIPS")
        print(rule)
        print()

        by_rel_type = {}
        for rel in relationships:
            by_rel_type.setdefault(rel['type'], []).append(rel)

        for rel_type, rels in sorted(by_rel_type.items()):
            print(f"\n{rel_type} ({len(rels)} relationships):")
            for rel in rels[:5]:
                print(f"  {rel['from']:10s} --> {rel['to']}")
            if len(rels) > 5:
                print(f"  ... and {len(rels) - 5} more")

    print("\n" + rule)
    print(f"Results: {passed}/{len(checks)} checks passed")
    print(rule + "\n")

    # ---- Debug aid: list the nodes that failed check 2 ----------------------
    nodes_without_denom = [n for n in all_nodes if not n.denomination]
    if nodes_without_denom:
        print("\n" + rule)
        print("DEBUG: NODES WITHOUT DENOMINATION")
        print(rule)
        print()
        print(f"Found {len(nodes_without_denom)} nodes without denomination:\n")

        for i, node in enumerate(nodes_without_denom, 1):
            print(f"{i}. Catalog: {node.catalog_number}")
            print(f"   Type: {node.node_type}/{node.sub_type}")
            print(f"   Issue: {node.issue_name}")
            print(f"   Page: {node.page_number}, Order: {node.reading_order}")
            print(f"   Raw text: {node.raw_text[:100]}...")
            print(f"   Color: {node.color}")
            print()

    return passed, len(checks) - passed

In [None]:
import os
import json
import matplotlib.pyplot as plt
import networkx as nx
from matplotlib.patches import FancyBboxPatch
from collections import defaultdict
import pandas as pd

def create_stamp_graph(base_stamp_node, all_nodes, relationships, output_dir="./graph_outputs"):
    """
    Draw one stamp with everything directly related to it and save it as a PNG.

    The base stamp is pinned at the top centre and the related nodes are
    spread along a semicircle beneath it, in the order in which they first
    appear in ``relationships`` (so the picture is the same on every run).
    Nodes are coloured by ``node_type``.

    Args:
        base_stamp_node: The main stamp node. Reads ``catalog_number``,
            ``node_type``, ``denomination``, ``color``, ``quantity``,
            ``issue_name`` and ``year``.
        all_nodes: List of all nodes; related nodes are looked up in it by
            ``catalog_number``.
        relationships: List of dicts with ``'from'``, ``'to'`` and ``'type'``
            keys. Only entries that touch the base stamp are drawn.
        output_dir: Directory to save graph images (created if missing).

    Returns:
        tuple: ``(output_path, G)`` -- path of the saved PNG and the
        ``networkx.DiGraph`` that was drawn.
    """
    os.makedirs(output_dir, exist_ok=True)

    base_id = base_stamp_node.catalog_number
    G = nx.DiGraph()

    # Base-stamp label: only include the fields that are actually present so a
    # missing denomination/colour never shows up as the literal text "None".
    base_label_parts = [base_id]
    if base_stamp_node.denomination:
        base_label_parts.append(base_stamp_node.denomination)
    if base_stamp_node.color:
        base_label_parts.append(base_stamp_node.color)
    if base_stamp_node.quantity:
        base_label_parts.append(f"Qty: {base_stamp_node.quantity}")
    G.add_node(base_id,
               node_obj=base_stamp_node,
               node_type=base_stamp_node.node_type,
               label='\n'.join(str(part) for part in base_label_parts))

    # Relationships that involve this stamp, in either direction.
    related_relationships = [
        rel for rel in relationships
        if rel['from'] == base_id or rel['to'] == base_id
    ]

    # Unique endpoint ids in first-seen order. A dict is used instead of a set
    # because set iteration order for strings changes between interpreter
    # sessions, which made the node placement non-reproducible.
    related_ids = dict.fromkeys(
        endpoint
        for rel in related_relationships
        for endpoint in (rel['from'], rel['to'])
    )

    node_lookup = {n.catalog_number: n for n in all_nodes}

    for node_id in related_ids:
        if node_id == base_id or node_id not in node_lookup:
            continue
        node = node_lookup[node_id]
        label_parts = [node_id]
        if node.denomination:
            label_parts.append(node.denomination)
        if node.color:
            label_parts.append(node.color)
        if node.node_type != 'stamp':
            label_parts.append(f"({node.node_type})")

        G.add_node(node_id,
                   node_obj=node,
                   node_type=node.node_type,
                   label='\n'.join(label_parts))

    # Edges carry the raw relationship type plus a human-readable label.
    # Endpoints missing from ``node_lookup`` are created implicitly here,
    # without a label or node_type.
    for rel in related_relationships:
        G.add_edge(rel['from'], rel['to'],
                   rel_type=rel['type'],
                   label=rel['type'].replace('_', ' ').title())

    # Fill colour per node type; unknown types fall back to grey.
    node_colors = {
        'stamp': '#4CAF50',        # Green for base stamps
        'variety': '#2196F3',       # Blue for varieties
        'proof': '#FF9800',         # Orange for proofs
        'specimen': '#9C27B0',      # Purple for specimens
        'error': '#F44336',         # Red for errors
        'overprint': '#00BCD4',     # Cyan for overprints
        'surcharge': '#FFEB3B',     # Yellow for surcharges
    }
    colors = [
        node_colors.get(G.nodes[node_id].get('node_type', 'stamp'), '#9E9E9E')
        for node_id in G.nodes()
    ]

    # Layout: every position is assigned explicitly, so no force-directed
    # pre-pass is needed.
    others = [n for n in G.nodes() if n != base_id]
    if others:
        pos = {base_id: (0.5, 0.9)}
        radius = 0.4
        angle_step = np.pi / (len(others) + 1)
        for i, node_id in enumerate(others, start=1):
            # Offsetting by pi puts the nodes on the lower half of the circle.
            angle = angle_step * i + np.pi
            pos[node_id] = (0.5 + radius * np.cos(angle),
                            0.4 + radius * np.sin(angle) * 0.6)
    else:
        pos = {base_id: (0.5, 0.5)}

    fig, ax = plt.subplots(1, 1, figsize=(14, 10))
    try:
        nx.draw_networkx_nodes(G, pos,
                               node_color=colors,
                               node_size=3000,
                               alpha=0.9,
                               ax=ax)

        labels = nx.get_node_attributes(G, 'label')
        nx.draw_networkx_labels(G, pos, labels,
                                font_size=8,
                                font_weight='bold',
                                ax=ax)

        nx.draw_networkx_edges(G, pos,
                               edge_color='gray',
                               arrows=True,
                               arrowsize=20,
                               arrowstyle='-|>',
                               width=2,
                               alpha=0.6,
                               ax=ax)

        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(G, pos, edge_labels,
                                     font_size=7,
                                     font_color='red',
                                     ax=ax)

        # Title with stamp details.
        title_parts = [f"Stamp Network: {base_id}"]
        if base_stamp_node.issue_name:
            title_parts.append(f"Issue: {base_stamp_node.issue_name}")
        if base_stamp_node.year:
            title_parts.append(f"Year: {base_stamp_node.year}")
        ax.set_title('\n'.join(title_parts), fontsize=14, fontweight='bold')

        # Legend: one empty scatter handle per node type present in the graph.
        present_types = {G.nodes[n].get('node_type') for n in G.nodes()}
        legend_elements = [
            ax.scatter([], [], c=color, s=100, label=node_type.title())
            for node_type, color in node_colors.items()
            if node_type in present_types
        ]
        if legend_elements:
            ax.legend(handles=legend_elements, loc='upper left', frameon=True)

        # Statistics box in the lower-left corner (axes coordinates).
        stats_text = f"Nodes: {len(G.nodes())}\nRelationships: {len(G.edges())}"
        ax.text(0.02, 0.02, stats_text, transform=ax.transAxes,
                fontsize=10, verticalalignment='bottom',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        ax.axis('off')
        fig.tight_layout()

        # File name mirrors the anchor ids used by create_html_summary.
        safe_filename = base_id.replace('/', '_').replace(' ', '_')
        output_path = os.path.join(output_dir, f"stamp_{safe_filename}.png")
        fig.savefig(output_path, dpi=150, bbox_inches='tight')
    finally:
        # Always release the figure: this function is called once per stamp.
        plt.close(fig)

    return output_path, G


def create_comparative_table(base_stamp_node, all_nodes, relationships, output_dir="./graph_outputs"):
    """
    Render a table image comparing a stamp with the items related to it.

    The first row is the base stamp; each following row is the target of a
    relationship whose ``'from'`` equals the base stamp's catalog number and
    whose ``'to'`` is found in ``all_nodes``. Missing values are shown as
    ``'N/A'`` in every row.

    NOTE(review): only outgoing relationships are listed here, whereas
    create_stamp_graph also follows relationships pointing *to* the base
    stamp -- confirm whether the table should include those as well.

    Args:
        base_stamp_node: The main stamp node.
        all_nodes: List of all nodes, looked up by ``catalog_number``.
        relationships: List of dicts with ``'from'``, ``'to'`` and ``'type'``.
        output_dir: Directory to save the table image (created if missing).

    Returns:
        str: Path of the saved PNG.
    """
    os.makedirs(output_dir, exist_ok=True)

    base_id = base_stamp_node.catalog_number

    def _row(node, relationship):
        """Build one table row; every row has the same nine columns."""
        return {
            'Catalog #': node.catalog_number,
            'Type': node.node_type,
            'Denomination': node.denomination or 'N/A',
            'Color': node.color or 'N/A',
            # Present on every row: when related rows lacked this key pandas
            # filled the column with NaN and the table showed "nan".
            'Quantity': node.quantity or 'N/A',
            'Year': node.year or 'N/A',
            'Attributes': ', '.join(node.attributes) if node.attributes else 'N/A',
            'References': ', '.join(node.reference_numbers) if node.reference_numbers else 'N/A',
            'Relationship': relationship,
        }

    related_data = [_row(base_stamp_node, 'BASE STAMP')]

    node_lookup = {n.catalog_number: n for n in all_nodes}
    for rel in relationships:
        if rel['from'] == base_id and rel['to'] in node_lookup:
            related_data.append(
                _row(node_lookup[rel['to']], rel['type'].replace('_', ' ').upper())
            )

    df = pd.DataFrame(related_data)

    # Figure height grows with the number of rows, with a minimum of 4 inches.
    fig, ax = plt.subplots(figsize=(14, max(4, len(related_data) * 0.5)))
    try:
        ax.axis('tight')
        ax.axis('off')

        # One width per column, in the column order produced by _row.
        table = ax.table(cellText=df.values,
                         colLabels=df.columns,
                         cellLoc='left',
                         loc='center',
                         colWidths=[0.10, 0.09, 0.11, 0.10, 0.09, 0.08, 0.17, 0.13, 0.10])

        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.5)

        for col in range(len(df.columns)):
            # Row 0 is the header row.
            table[(0, col)].set_facecolor('#4CAF50')
            table[(0, col)].set_text_props(weight='bold', color='white')
            # Row 1 is the base stamp; tint it to set it apart.
            table[(1, col)].set_facecolor('#E8F5E9')

        title = f"Stamp Catalog Comparison: {base_id}"
        if base_stamp_node.issue_name:
            title += f"\nIssue: {base_stamp_node.issue_name}"
        ax.set_title(title, fontsize=12, fontweight='bold', pad=20)

        safe_filename = base_id.replace('/', '_').replace(' ', '_')
        output_path = os.path.join(output_dir, f"table_{safe_filename}.png")
        fig.savefig(output_path, dpi=150, bbox_inches='tight')
    finally:
        # Always release the figure: this function is called once per stamp.
        plt.close(fig)

    return output_path


def enhanced_test_full_integration():
    """Extract nodes from two catalog pages and render a graph and table per base stamp.

    Reads the recognition JSON, keeps the pages whose ``page_number`` is 8 or
    9, runs the extractor on them, and for every stamp without a
    ``base_stamp`` reference calls create_stamp_graph and
    create_comparative_table. An HTML index of the results is written at the
    end. Returns ``None``; returns early (after a warning) when the file is
    missing or its JSON has an unexpected shape.
    """
    rule = "=" * 80

    print(rule)
    print("ENHANCED TEST WITH GRAPH VISUALIZATION")
    print(rule)
    print()

    input_path = "./results/recognition_json/Mena 2018 CRPC .json"

    if not os.path.exists(input_path):
        print(f"⚠️  File not found: {input_path}")
        return

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The export is either a bare list of pages or {"pages": [...]}.
    if isinstance(data, list):
        all_pages = data
    elif isinstance(data, dict) and 'pages' in data:
        all_pages = data['pages']
    else:
        print("⚠️  Unexpected JSON structure")
        return

    # Pages under test.
    sample_data = [page for page in all_pages if page.get('page_number') in (8, 9)]

    extractor = PhilatelicExtractor()

    # Flatten the per-page results, keeping page order.
    all_nodes = [node for page in sample_data for node in extractor.process_page(page)]

    relationships = extractor.build_relationships(all_nodes)

    # A base stamp is a 'stamp' node that does not point at another stamp.
    base_stamps = [n for n in all_nodes if n.node_type == 'stamp' and not n.base_stamp]
    stamp_total = len(base_stamps)

    print(f"Found {stamp_total} base stamps to visualize")
    print("Creating individual graphs for each stamp...\n")

    graph_paths = []
    table_paths = []

    for position, stamp in enumerate(base_stamps, 1):
        print(f"Processing {position}/{stamp_total}: {stamp.catalog_number} - {stamp.issue_name}")

        # Network graph for this stamp.
        graph_path, graph = create_stamp_graph(stamp, all_nodes, relationships)
        graph_paths.append(graph_path)
        print(f"  ✅ Graph saved: {graph_path}")

        # Comparison table for this stamp.
        table_path = create_comparative_table(stamp, all_nodes, relationships)
        table_paths.append(table_path)
        print(f"  ✅ Table saved: {table_path}")

        print(f"  📊 Graph stats: {len(graph.nodes())} nodes, {len(graph.edges())} relationships")
        print()

    # HTML index linking every graph and table.
    create_html_summary(base_stamps, graph_paths, table_paths)

    print(rule)
    print("VISUALIZATION COMPLETE")
    print(f"Generated {len(graph_paths)} stamp graphs")
    print(f"Generated {len(table_paths)} comparison tables")
    print("Check ./graph_outputs/ directory for results")
    print("Open summary.html for easy navigation")
    print(rule)


def create_html_summary(base_stamps, graph_paths, table_paths, output_dir="./graph_outputs"):
    """Create an HTML file to easily view all generated graphs.

    Writes ``summary.html`` into ``output_dir`` with a navigation list and one
    section per stamp showing its graph and table images (referenced by file
    name only, so the images are expected to sit next to the HTML file).

    Args:
        base_stamps: Stamp nodes; reads ``catalog_number``, ``issue_name``,
            ``denomination``, ``color`` and ``year``.
        graph_paths: Graph image paths, index-aligned with ``base_stamps``.
        table_paths: Table image paths, index-aligned with ``base_stamps``.
        output_dir: Destination directory (created if missing, so the summary
            can be written even when no graph was generated beforehand).

    Raises:
        IndexError: If ``graph_paths`` or ``table_paths`` is shorter than
            ``base_stamps``.
    """
    # Local import: ``html`` is not among the file's import cells.
    from html import escape

    os.makedirs(output_dir, exist_ok=True)

    def _anchor(stamp):
        """Anchor id for a stamp, using the same substitutions as the image file names."""
        return escape(stamp.catalog_number.replace('/', '_').replace(' ', '_'), quote=True)

    def _text(value):
        """Escaped display text; falsy values are shown as N/A."""
        return escape(str(value or 'N/A'), quote=True)

    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Philatelic Catalog Verification</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            h1 { color: #333; }
            .stamp-section { 
                border: 2px solid #ddd; 
                margin: 20px 0; 
                padding: 15px; 
                border-radius: 5px;
            }
            .stamp-header { 
                background: #f5f5f5; 
                padding: 10px; 
                margin: -15px -15px 15px -15px;
                border-radius: 3px 3px 0 0;
            }
            .images { display: flex; gap: 20px; }
            .image-container { flex: 1; text-align: center; }
            img { max-width: 100%; border: 1px solid #ccc; }
            .navigation { 
                position: fixed; 
                right: 20px; 
                top: 20px; 
                background: white; 
                border: 1px solid #ddd;
                padding: 10px;
                max-height: 80vh;
                overflow-y: auto;
            }
        </style>
    </head>
    <body>
        <h1>Philatelic Catalog Verification Graphs</h1>
        <div class="navigation">
            <h3>Quick Navigation</h3>
            <ul>
    """

    # Navigation links. Catalog text comes from OCR output, so everything
    # interpolated into the markup is HTML-escaped.
    for stamp in base_stamps:
        catalog = escape(str(stamp.catalog_number), quote=True)
        html_content += f'<li><a href="#{_anchor(stamp)}">{catalog}</a></li>\n'

    html_content += """
            </ul>
        </div>
        <div style="margin-right: 200px;">
    """

    # One section per stamp.
    for i, stamp in enumerate(base_stamps):
        safe_id = _anchor(stamp)
        catalog = escape(str(stamp.catalog_number), quote=True)
        issue = _text(stamp.issue_name)
        denomination = _text(stamp.denomination)
        color = _text(stamp.color)
        year = _text(stamp.year)
        graph_filename = escape(os.path.basename(graph_paths[i]), quote=True)
        table_filename = escape(os.path.basename(table_paths[i]), quote=True)

        html_content += f"""
        <div class="stamp-section" id="{safe_id}">
            <div class="stamp-header">
                <h2>{catalog}</h2>
                <p><strong>Issue:</strong> {issue}</p>
                <p><strong>Denomination:</strong> {denomination}</p>
                <p><strong>Color:</strong> {color}</p>
                <p><strong>Year:</strong> {year}</p>
            </div>
            <div class="images">
                <div class="image-container">
                    <h3>Relationship Graph</h3>
                    <img src="{graph_filename}" alt="Graph for {catalog}">
                </div>
                <div class="image-container">
                    <h3>Comparison Table</h3>
                    <img src="{table_filename}" alt="Table for {catalog}">
                </div>
            </div>
        </div>
        """

    html_content += """
        </div>
    </body>
    </html>
    """

    html_path = os.path.join(output_dir, "summary.html")
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"✅ HTML summary created: {html_path}")


# Additional utility function for verification
def export_verification_csv(all_nodes, relationships, output_path="./graph_outputs/verification.csv"):
    """Export all data to CSV for manual verification against catalog.

    One row is written per node. ``Related_To`` lists, separated by ``'; '``,
    every relationship whose ``'from'`` equals the node's catalog number, each
    formatted as ``"<to> (<type>)"``.

    Args:
        all_nodes: Nodes to export.
        relationships: List of dicts with ``'from'``, ``'to'`` and ``'type'``.
        output_path: Destination CSV path; its parent directory is created
            when it does not exist.

    Returns:
        pandas.DataFrame: The exported table (with the full header even when
        ``all_nodes`` is empty).
    """
    columns = [
        'Catalog_Number', 'Type', 'SubType', 'Issue', 'Year', 'Denomination',
        'Color', 'Attributes', 'References', 'Base_Stamp', 'Related_To',
        'Page', 'Reading_Order',
    ]

    # Index outgoing relationships once instead of rescanning the whole list
    # for every node.
    outgoing = defaultdict(list)
    for rel in relationships:
        outgoing[rel['from']].append(f"{rel['to']} ({rel['type']})")

    rows = [
        {
            'Catalog_Number': node.catalog_number,
            'Type': node.node_type,
            'SubType': node.sub_type,
            'Issue': node.issue_name,
            'Year': node.year,
            'Denomination': node.denomination,
            'Color': node.color,
            'Attributes': ', '.join(node.attributes) if node.attributes else '',
            'References': ', '.join(node.reference_numbers) if node.reference_numbers else '',
            'Base_Stamp': node.base_stamp or '',
            # .get() avoids inserting empty entries into the defaultdict.
            'Related_To': '; '.join(outgoing.get(node.catalog_number, [])),
            'Page': node.page_number,
            'Reading_Order': node.reading_order,
        }
        for node in all_nodes
    ]

    df = pd.DataFrame(rows, columns=columns)

    # Make sure the destination directory exists; a bare file name has no
    # parent component and needs nothing created.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"✅ Verification CSV exported: {output_path}")

    return df

In [None]:
"""Run all expert validation tests"""
    
print("\n")
print("╔" + "="*78 + "╗")
print("║" + " "*15 + "EXPERT VALIDATION TEST SUITE - COSTA RICA" + " "*22 + "║")
print("║" + " "*22 + "Philately & Regex Expert Edition" + " "*24 + "║")
print("╚" + "="*78 + "╝")
print()

# Each entry is a (suite name, passed count, failed count) tuple.
results = []

# Test 1: Denominations
p1, f1 = test_denomination_extraction()
results.append(('Denomination Extraction', p1, f1))

# Test 2: Issue names
p2, f2 = test_issue_name_tracking()
results.append(('Issue Name Tracking', p2, f2))

# Test 3: Quantity vs Denomination
p3, f3 = test_quantity_vs_denomination()
results.append(('Quantity vs Denomination', p3, f3))

# Test 4: Color extraction
p4, f4 = test_color_extraction_expert()
results.append(('Expert Color Extraction', p4, f4))

# Test 5: Full integration
p5, f5 = test_full_integration()
results.append(('Full Integration', p5, f5))

# Final summary
print("\n" + "="*80)
print("FINAL SUMMARY")
print("="*80)
print()

# Totals across all suites: passed, and passed + failed.
total_passed = sum(r[1] for r in results)
total_tests = sum(r[1] + r[2] for r in results)

# One line per suite; the percentage is reported as 0 when a suite ran no tests.
for name, passed, failed in results:
    total = passed + failed
    pct = (passed / total * 100) if total > 0 else 0
    status = '✅' if failed == 0 else '⚠️ '
    print(f"{status} {name:30s}: {passed:2d}/{total:2d} ({pct:5.1f}%)")

# NOTE(review): unlike the per-suite rows, the overall percentage below has no
# guard for total_tests == 0.
print(f"\n{'='*80}")
print(f"OVERALL: {total_passed}/{total_tests} tests passed ({total_passed/total_tests*100:.1f}%)")
print(f"{'='*80}\n")

# Closing message: success banner when everything passed, otherwise the
# number of failures.
if total_passed == total_tests:
    print("🎉 ALL TESTS PASSED!")
    print("\n✅ El extractor está listo para procesar el catálogo completo de Costa Rica")
    print("✅ Maneja todas las denominaciones: centavos, colones, pesos, reales")
    print("✅ Extrae issue names correctamente de elementos <sec>")
    print("✅ Distingue cantidades de denominaciones")
    print("✅ Extrae colores sin confundirlos con overprints")
    print("\n🚀 Próximo paso: Ejecutar en páginas 30-31 completas")
else:
    print(f"⚠️  {total_tests - total_passed} tests fallaron")
    print("Revisa los detalles arriba para corregir los problemas.")

In [None]:
# Generate the per-stamp graphs, comparison tables and the HTML summary.
enhanced_test_full_integration()

## Watson Test

In [None]:
import json
from landingai_ade import LandingAIADE
# Load environment variables 
from dotenv import load_dotenv
load_dotenv()
import re

In [None]:
import nest_asyncio, asyncio, logging
nest_asyncio.apply()                        # permite reusar el event loop de Jupyter
logging.getLogger('asyncio').setLevel(logging.CRITICAL)

from pydantic import BaseModel, Field
from typing import List
from beeai_framework.backend import ChatModel, ChatModelParameters, UserMessage, SystemMessage

# ---- Modelo Pydantic para salida estructurada ----
class BusinessPlan(BaseModel):
    # Structured-output schema passed as ``schema=`` to ``llm.create_structure``
    # in structured_output_example; each ``description`` documents the field.
    # NOTE(review): documented with comments rather than a class docstring
    # because pydantic may surface a docstring in the generated schema -- confirm.
    business_name: str = Field(description="Catchy name for the business")
    elevator_pitch: str = Field(description="30-second description of the business")
    target_market: str = Field(description="Primary target audience")
    unique_value_proposition: str = Field(description="What makes this business special")
    revenue_streams: List[str] = Field(description="Ways the business will make money")
    startup_costs: str = Field(description="Estimated initial investment needed")
    key_success_factors: List[str] = Field(description="Critical elements for success")

# ---- Función async original (no la cambies) ----
async def structured_output_example():
    """Request a ``BusinessPlan``-shaped answer from the chat model and print it.

    Sends a system message and one user prompt to the "openai:gpt-5-nano"
    model with temperature 0, asks for output matching the ``BusinessPlan``
    schema, and prints each field of the returned object.
    """
    user_prompt = "Create a business plan for a mobile app that helps people find and book unique local experiences in their city."

    llm = ChatModel.from_name("openai:gpt-5-nano", ChatModelParameters(temperature=0))

    response = await llm.create_structure(
        schema=BusinessPlan,
        messages=[
            SystemMessage(content="You are an expert business consultant and entrepreneur."),
            UserMessage(content=user_prompt),
        ],
    )

    # The structured result is read with dictionary-style access.
    plan = response.object

    print("User: " + user_prompt)
    print('*' * 50)
    print("\n🚀 AI-Generated Business Plan:")
    print(f"💡 Business Name: {plan['business_name']}")
    print(f"🎯 Elevator Pitch: {plan['elevator_pitch']}")
    print(f"👥 Target Market: {plan['target_market']}")
    print(f"⭐ Unique Value Proposition: {plan['unique_value_proposition']}")
    print(f"💰 Revenue Streams: {', '.join(plan['revenue_streams'])}")
    print(f"💵 Startup Costs: {plan['startup_costs']}")
    print("🔑 Key Success Factors:")
    for success_factor in plan['key_success_factors']:
        print(f"  - {success_factor}")

# ---- En Jupyter: usa await (NO asyncio.run) ----
await structured_output_example()


In [None]:
# pip install ibm-watsonx-ai python-dotenv
from os import getenv
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai import Credentials

# Build the watsonx.ai inference client. The endpoint URL, API key and project
# id are read from environment variables (getenv yields None when one is unset).
model = ModelInference(
    model_id="meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
    credentials=Credentials(
        url=getenv("WATSONX_URL"),
        api_key=getenv("WATSONX_API_KEY"),
    ),
    project_id=getenv("WATSONX_PROJECT_ID"),
    params={"temperature": 0, "max_new_tokens": 8192},
)

# NOTE(review): the prompt below is still a placeholder string, not catalog text.
resp = model.generate(prompt="[Tu catálogo Costa Rica aquí]")
# Print the generated text of the first result.
print(resp["results"][0]["generated_text"])


## Get the Catalogues with Landing AI

In [None]:
import json
from landingai_ade import LandingAIADE
# Load environment variables 
from dotenv import load_dotenv
load_dotenv()
import re
import os

In [None]:
# PDF PATH
# Folder holding the source catalogue PDFs (kept with its trailing slash because
# the parse cell builds the full path by plain string concatenation).
pdf_path = "./pdfs/Catalogues/"
# File name WITHOUT extension: ".pdf" is appended when parsing, and the same
# stem is reused for the "<name>.md" and "<name>_chunks.json" outputs.
pdf_file_name = "Mena 2014 51-100"

## Parse Doc using Landing AI

In [None]:
# Parse the document
# Submits "<pdf_path><pdf_file_name>.pdf" to Landing AI ADE with the
# "dpt-2-latest" model and keeps the parse result in `response`.
# NOTE(review): the argument is named document_url but receives a local
# relative path -- confirm the SDK accepts local files through this parameter.
response = LandingAIADE().parse(document_url=pdf_path+pdf_file_name+".pdf",model="dpt-2-latest")


In [None]:
# Echo the parser output so the extraction can be inspected in the notebook.
print("Extracted Markdown:")
print(response.markdown)
print("Extracted Chunks:")
print(response.chunks)

# Make sure the output folder exists: open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('results/parsed_catalogues', exist_ok=True)

# Save Markdown to a file
if response.markdown:
    with open(f'results/parsed_catalogues/{pdf_file_name}.md', 'w', encoding='utf-8') as f:
        f.write(response.markdown)
    print("\nMarkdown content saved to a Markdown file.")
else:
    print("No 'markdown' field found in the response")

# Save Chunks to a JSON file
if response.chunks:
    # Convert each chunk object to a plain dict so it can be JSON-serialised
    chunks_data = [chunk.model_dump() for chunk in response.chunks]

    # ensure_ascii=False keeps accented characters readable in the saved file
    with open(f'results/parsed_catalogues/{pdf_file_name}_chunks.json', 'w', encoding='utf-8') as f:
        json.dump(chunks_data, f, ensure_ascii=False, indent=2)
    print(f"\n{len(chunks_data)} chunks saved to JSON file.")
else:
    print("No 'chunks' field found in the response")

### Leyendo el archivo de Landing AI

In [None]:
# Reload the chunk list saved as "<pdf_file_name>_chunks.json" so the grouping
# logic can run without calling the Landing AI parser again.
group_chunks = []
with open(f'results/parsed_catalogues/{pdf_file_name}_chunks.json', 'r', encoding='utf-8') as f:
    chunks_data = json.load(f)
    print(f"Se cargaron {len(chunks_data)} chunks desde el archivo JSON.\n")
    # group_chunks is bound to the same list object as chunks_data (no copy).
    group_chunks = chunks_data

    

### Lógica para agrupar Mena

In [None]:
def clean_chunk_text(text: str) -> str:
    """Return ``text`` with every ``<a id="..."></a>`` anchor tag removed.

    Only the tag itself is dropped; surrounding whitespace and newlines are
    left untouched. The id may be quoted with single or double quotes and must
    contain at least one character.
    """
    anchor_tag = re.compile(r'<a id=[\'"][^\'"]+[\'"]></a>')
    return anchor_tag.sub('', text)

def get_header(texto):
    """Extract the header candidate that follows the first blank line.

    Two patterns are tried in order and the first one that matches wins:

    1. the text between a double newline and the next newline;
    2. the text between ``**`` markers right after a double newline
       (e.g. ``\\n\\n**Plasma Technology Export issue**`` with no trailing
       newline); here the asterisks are not part of the result.

    Returns:
        The captured text with surrounding whitespace stripped (possibly an
        empty string), or None when neither pattern matches.
    """
    candidate_patterns = (
        r'\n\n(.*?)\n',
        r'\n\n\*\*(.*?)\*\*',
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, texto)
        if found is not None:
            return found.group(1).strip()
    return None

def has_year(texto):
    """Tell whether the text mentions a year.

    A year is a standalone four-digit number (delimited by word boundaries)
    from 1860 to 2025 inclusive, as accepted by the pattern below.

    Returns:
        True if at least one such year occurs in ``texto``, otherwise False.
    """
    year_pattern = r'\b(18[6-9]\d|19\d{2}|20[0-1]\d|202[0-5])\b'
    return bool(re.search(year_pattern, texto))

def has_issue(texto):
    """Tell whether the text names an issue other than a "regular issue".

    The comparison is case-insensitive. Any occurrence of "regular issue"
    makes the result False, even if the word "issue" also appears elsewhere.
    """
    lowered = texto.lower()
    return "issue" in lowered and "regular issue" not in lowered
def has_guanacaste_type(texto):
    """Return True if the text contains "guanacaste type" (case-insensitive)."""
    return "guanacaste type" in texto.lower()

def has_prestamp_postmark(texto):
    """Return True if the text contains "pre stamp postmark" (case-insensitive)."""
    return "pre stamp postmark" in texto.lower()

def has_receptoria(texto):
    """Return True if the text contains "receptoria" (case-insensitive)."""
    return "receptoria" in texto.lower()
def has_postal_related_revenue(texto):
    """Return True if the text contains "postal related revenue" (case-insensitive)."""
    return "postal related revenue" in texto.lower()

In [None]:
def eliminar_texto_entre_marcadores(texto):
    """Remove every ``<:: ... ::>`` span, markers included, from the text.

    Matching is non-greedy, so each opening marker pairs with the nearest
    closing one, and DOTALL lets a span run across line breaks.
    """
    marked_span = re.compile(r'<::.*?::>', flags=re.DOTALL)
    return marked_span.sub('', texto)

def extraer_texto_entre_marcadores(texto):
    """Collect the contents of every ``<:: ... ::>`` span in the text.

    Returns:
        A list with the text found between each pair of markers (markers
        excluded), in order of appearance; empty when there is no span.
        Spans may cover several lines.
    """
    marked_span = re.compile(r'<::(.*?)::>', flags=re.DOTALL)
    return marked_span.findall(texto)

In [None]:
# group_chunks = []
# with open(f'results/parsed_catalogues/{pdf_file_name}_chunks.json', 'r', encoding='utf-8') as f:
#     chunks_data = json.load(f)
#     print(f"Se cargaron {len(chunks_data)} chunks desde el archivo JSON.\n")
#     group_chunks = chunks_data


# grouped_chunks = []
# current_group = None

# special_cases_1_50 = ['Overprint "Correos" on revenue stamps',"Issue of 1903",'Surcharges on 1923 stamps','Surcharge on 1923 stamp','**Bar Cancels**','**"Un Centimo" Surcharge**','Surcharge "Correos/5 Centimos"','Overprint "Compre Ud. Cafe de Costa Rica" in circle']
# special_cases_51_100 = ['Surcharge on Mauro Fernandez stamp',"Surcharges on revenue stamps",'Surcharges on Christmas tax stamps','**Heroes 1856 Campaign issue**','**Orchids issue**']
# keywords = ["ELECTORAL STAMPS", "TELEGRAPH SEALS", "RADIOGRAM SEALS", 
#             "MISCELLANEOUS", "UNAPPROVED ESSAYS", 
#             "INTERNATIONAL REPLY COUPONS", "NON OFFICIAL",
#             "OFFICIAL POSTAL SEALS", "REGISTRATION LABELS","ROSS FANTASIES","First Issue Fantasies","1901 Overprint Fantasy"]
# for i, chunk in enumerate(chunks_data):
    
#     is_text_or_table = chunk.get('type') in ["text","table"]
#     is_fig= chunk.get('type') in ["figure"]
#     chunk_text = clean_chunk_text(chunk.get('markdown', 'N/A'))
#     chunk_page = chunk.get('grounding', 'N/A')['page']
    
#     # Verificar si es un header válido
#     header_found = None
#     if is_text_or_table and chunk_text.startswith("\n\n"):
#         chunk_text = chunk_text.replace("\n\nHR Mena                                 SURFACE MAIL                                Costa Rica Postal Catalogue\n\n","\n\n") ##BUG FIX
#         chunk_text = chunk_text.replace("\n\nHR Mena\nSURFACE MAIL\nCosta Rica Postal Catalogue\n\n","\n\n") ##BUG FIX
#         chunk_text = chunk_text.replace("\n\nHR Mena\nSURFACE MAIL Costa Rica Post\n","\n\n") ##BUG FIX     
#         chunk_text = chunk_text.replace("\n\nHR Mena\n\nSURFACE MAIL\n\nCosta Rica Postal Catalogue\n\n","\n\n") ##BUG FIX    
#         header_candidate = get_header(chunk_text)
#         if header_candidate:
#             # Validar que no sea uno de los casos excluidos
#             if header_candidate.lower() == "costa rica postal catalogue":
#                 pass
#             elif "Essay" in header_candidate: ##UN BUG
#                     pass
#             elif "Essays," in header_candidate: ##UN BUG
#                 pass
#             elif "Decree," in header_candidate: ##UN BUG
#                 pass
#             elif "Decree" in header_candidate: ##UN BUG
#                 pass
#             elif "All stamps" in header_candidate: ##UN BUG
#                 pass
#             elif has_guanacaste_type(header_candidate): ##CASO GUANACASTE TYPES
#                 header_found = header_candidate
#                 print(header_candidate,"--> in page:",chunk_page)
#             elif has_prestamp_postmark(header_candidate): ##CASO PRE STAMP POSTMARK
#                 header_found = header_candidate
#                 print(header_candidate,"--> in page:",chunk_page)
#             elif has_receptoria(header_candidate): ##CASO RECEPTORIA
#                 header_found = header_candidate
#                 print(header_candidate,"--> in page:",chunk_page)
#             elif has_postal_related_revenue(header_candidate): ##CASO POSTAL RELATED REVENUE
#                 header_found = header_candidate
#                 print(header_candidate,"--> in page:",chunk_page)
#             elif any(keyword in header_candidate for keyword in keywords):
#                 header_found = header_candidate
#                 print(f"{header_candidate} --> in page: {chunk_page}")                  
#             elif header_candidate in special_cases_1_50: ##CASO ESPECIAL 1-50
#                 header_found = header_candidate
#                 print(header_candidate,"--> in page:",chunk_page)
#             elif header_candidate in special_cases_51_100: ##CASO ESPECIAL 51-100
#                 header_found = header_candidate
#                 print(header_candidate,"--> in page:",chunk_page)
#             elif has_year(header_candidate) or has_issue(header_candidate):
#                 if len(header_candidate) > 10: ##CONSIDERARLO
#                     pass
#                 header_found = header_candidate
#                 print(header_candidate,"--> in page:",chunk_page)
#             elif len(header_candidate) > 5: ## ESTO NO ES UN HEADER
#                 pass
#     if is_fig and chunk_text.startswith("\n\n"):
#         plain_fig = chunk_text.startswith("\n\n<::")
#         if not plain_fig:            
#             chunk_text = chunk_text.replace("\n\nHR Mena                                 SURFACE MAIL                                Costa Rica Postal Catalogue\n\n","\n\n") ##BUG FIX
#             chunk_text = chunk_text.replace("\n\nHR Mena\nSURFACE MAIL\nCosta Rica Postal Catalogue\n\n","\n\n") ##BUG FIX     
#             header_candidate = get_header(chunk_text) 
#             if header_candidate:
#                 if has_year(header_candidate) or has_issue(header_candidate):
#                     if len(header_candidate) > 10: ##CONSIDERARLO
#                         pass
#                     header_found = header_candidate
#                     print(header_candidate,"--> in page:",chunk_page) 

#     # Si encontramos un header válido
#     if header_found:
#         # Guardar el grupo anterior si existe
#         if current_group is not None:
#             grouped_chunks.append(current_group)
        
#         # Crear un nuevo grupo con este header
#         current_group = {
#             "header": header_found,
#             "chunks": [chunk]
#         }
#     else:
#         # Si no es header, agregar al grupo actual (si existe)
#         if current_group is not None:
#             current_group["chunks"].append(chunk)

# # No olvidar agregar el último grupo
# if current_group is not None:
#     grouped_chunks.append(current_group)

# # Ahora grouped_chunks contiene todos los grupos
# print(f"\nTotal de grupos creados: {len(grouped_chunks)}")

In [None]:
def get_grouped_chunks(group_chunks):
    """Group a flat sequence of parsed chunks into header-led sections.

    Chunks are visited in order. A chunk that opens a new section starts a new
    group; every other chunk is appended to the group currently open. Chunks
    seen before the first header are discarded.

    A "text" or "table" chunk opens a section when its cleaned text starts
    with two newlines, its header candidate (see ``get_header``, taken after
    the running page banner is stripped) is not an excluded phrase, and the
    candidate matches one of the header rules: Guanacaste type, pre stamp
    postmark, receptoria, postal related revenue, a section keyword, one of
    the hard-coded special cases, a year, or an "issue" title.

    A "figure" chunk opens a section only when its text starts with two
    newlines, is not a bare ``<::...::>`` description, and its header
    candidate contains a year or an "issue" title.

    The chunk's page number is deliberately not read: grouping does not need
    it, and a chunk without grounding data must not abort the whole run.

    Args:
        group_chunks: iterable of chunk dicts; only the ``type`` and
            ``markdown`` keys are read.

    Returns:
        A list of ``{"header": str, "chunks": [chunk, ...]}`` dicts in
        document order. The chunk that carried the header is the first
        element of its group's ``chunks``.
    """
    special_cases_1_50 = ['Overprint "Correos" on revenue stamps',"Issue of 1903",'Surcharges on 1923 stamps','Surcharge on 1923 stamp','**Bar Cancels**','**"Un Centimo" Surcharge**','Surcharge "Correos/5 Centimos"','Overprint "Compre Ud. Cafe de Costa Rica" in circle']
    special_cases_51_100 = ['Surcharge on Mauro Fernandez stamp',"Surcharges on revenue stamps",'Surcharges on Christmas tax stamps','**Heroes 1856 Campaign issue**','**Orchids issue**']
    keywords = ["ELECTORAL STAMPS", "TELEGRAPH SEALS", "RADIOGRAM SEALS",
                "MISCELLANEOUS", "UNAPPROVED ESSAYS",
                "INTERNATIONAL REPLY COUPONS", "NON OFFICIAL",
                "OFFICIAL POSTAL SEALS", "REGISTRATION LABELS","ROSS FANTASIES","First Issue Fantasies","1901 Overprint Fantasy"]

    # Candidates containing any of these fragments are never section headers.
    # ("Essay" also covers "Essays,", and "Decree" also covers "Decree,".)
    excluded_fragments = ("Essay", "Decree", "All stamps")

    # Variants of the running page banner that can precede the real header.
    # They are replaced by a plain double newline, in this order.
    text_page_banners = (
        "\n\nHR Mena                                 SURFACE MAIL                                Costa Rica Postal Catalogue\n\n",
        "\n\nHR Mena\nSURFACE MAIL\nCosta Rica Postal Catalogue\n\n",
        "\n\nHR Mena\nSURFACE MAIL Costa Rica Post\n",
        "\n\nHR Mena\n\nSURFACE MAIL\n\nCosta Rica Postal Catalogue\n\n",
    )
    # Figure chunks only strip the first two banner variants.
    figure_page_banners = text_page_banners[:2]

    grouped_chunks = []
    current_group = None

    for chunk in group_chunks:
        chunk_type = chunk.get('type')
        chunk_text = clean_chunk_text(chunk.get('markdown', 'N/A'))

        header_found = None
        # Only chunks whose text opens with a blank line can carry a header.
        if chunk_text.startswith("\n\n"):
            if chunk_type in ("text", "table"):
                for banner in text_page_banners:
                    chunk_text = chunk_text.replace(banner, "\n\n")
                header_candidate = get_header(chunk_text)
                if header_candidate:
                    is_excluded = (
                        header_candidate.lower() == "costa rica postal catalogue"
                        or any(fragment in header_candidate for fragment in excluded_fragments)
                    )
                    is_known_header = (
                        has_guanacaste_type(header_candidate)
                        or has_prestamp_postmark(header_candidate)
                        or has_receptoria(header_candidate)
                        or has_postal_related_revenue(header_candidate)
                        or any(keyword in header_candidate for keyword in keywords)
                        or header_candidate in special_cases_1_50
                        or header_candidate in special_cases_51_100
                        or has_year(header_candidate)
                        or has_issue(header_candidate)
                    )
                    # Exclusions take precedence over every acceptance rule.
                    if is_known_header and not is_excluded:
                        header_found = header_candidate
            elif chunk_type == "figure" and not chunk_text.startswith("\n\n<::"):
                # A figure that starts directly with "<::" is a bare image
                # description and never carries a header.
                for banner in figure_page_banners:
                    chunk_text = chunk_text.replace(banner, "\n\n")
                header_candidate = get_header(chunk_text)
                if header_candidate and (has_year(header_candidate) or has_issue(header_candidate)):
                    header_found = header_candidate

        if header_found:
            # Close the group that was open and start a new one at this chunk.
            if current_group is not None:
                grouped_chunks.append(current_group)
            current_group = {
                "header": header_found,
                "chunks": [chunk]
            }
        elif current_group is not None:
            # Not a header: the chunk belongs to the group currently open.
            current_group["chunks"].append(chunk)

    # Do not lose the group that was still open when the input ended.
    if current_group is not None:
        grouped_chunks.append(current_group)

    print(f"\nTotal de grupos creados: {len(grouped_chunks)}")

    return grouped_chunks

In [None]:
# NOTE: pdf_path is rebound here to the folder of parsed results; an earlier
# cell uses the same name for the folder of source PDFs.
pdf_path = "results/parsed_catalogues/"

# Collect the files in that folder whose name starts with "Mena 2014" and ends with ".json"
archivos_mena = [
    archivo for archivo in os.listdir(pdf_path)
    if archivo.startswith("Mena 2014") and archivo.endswith(".json")
]

print(archivos_mena)

In [None]:
# Explicit file list in ascending page-range order. It replaces the list built
# from os.listdir() in the previous cell, whose ordering is not guaranteed.
archivos_mena = ['Mena 2014 1-50_chunks.json', 'Mena 2014 51-100_chunks.json', 'Mena 2014 101-150_chunks.json', 'Mena 2014 151-200_chunks.json', 'Mena 2014 201-250_chunks.json', 'Mena 2014 251-300_chunks.json', 'Mena 2014 301-315_chunks.json']

In [None]:
# Group every Mena chunk file and concatenate the resulting groups in the
# order given by archivos_mena.
total_grouped_chunks = []
for mena_chunks_file in archivos_mena:
    group_chunks = []
    with open(f'results/parsed_catalogues/{mena_chunks_file}', 'r', encoding='utf-8') as f:
        chunks_data = json.load(f)
        group_chunks = chunks_data
    # Grouping runs once per file, so a group never spans two files and any
    # chunks that precede the first header of a file are not kept.
    mena_file_groups = get_grouped_chunks(group_chunks)
    total_grouped_chunks.extend(mena_file_groups)
    print(mena_chunks_file,"-->",len(mena_file_groups))
print("Total Grouped Chunks:",len(total_grouped_chunks))

In [None]:
# Preview of the first group only: print its figure descriptions and its
# plain text separately. The plain text is kept in final_test_text.
final_test_text = ""
for grouped_chunk in total_grouped_chunks[0:1]:
    #print(grouped_chunk['header'])
    chunks = grouped_chunk['chunks']
    final_text = ""
    for chunk in chunks:
        # attestation and marginalia chunks are left out of the joined text
        if chunk['type'] == 'attestation' or chunk['type'] == 'marginalia':
            pass
        else:
            final_text += clean_chunk_text(chunk['markdown'])
    print("------- TEXTO FIGURAS ----------")
    # Contents of the <:: ... ::> spans (figure descriptions)
    print(extraer_texto_entre_marcadores(final_text))
    print("------- PLAIN TEXT -------------")
    # Same text with the <:: ... ::> spans removed
    final_test_text = eliminar_texto_entre_marcadores(final_text)
    print(final_test_text)
    # Redundant with the [0:1] slice above: at most one group is processed.
    break

### Lógica de Agrupamiento Scott

In [None]:
import re
def _clean_chunk_text(text: str) -> str:
    """Normalise a chunk's markdown for display.

    Steps, in order:
      1. drop ``<a id="..."></a>`` anchor tags together with any newlines
         that directly follow them;
      2. collapse runs of three or more newlines into exactly two;
      3. strip leading and trailing whitespace.

    Figure markup (``<:: ... ::>``) is left in place.
    """
    without_anchors = re.sub(r'<a id=[\'"][^\'"]+[\'"]></a>\n*', '', text)
    collapsed = re.sub(r'\n{3,}', '\n\n', without_anchors)
    return collapsed.strip()

In [None]:
# from icecream import ic
# for i,group_chunk in enumerate(group_chunks):
#     if len(group_chunk) == 1:
#         print("Group Number ",i)
#         for chunk in group_chunk:
#             print("SOLUTION")
#             chunk_solution = group_chunks[i-1][-1]
#             print(f"Tipo: {chunk_solution.get('type', 'N/A')}")
#             print(f"Texto: {_clean_chunk_text(chunk_solution.get('markdown', 'N/A'))}")
#             print(f"Página: {chunk_solution.get('grounding', 'N/A')['page']}")
#             print("-------END SOLUTION----------")
#             print()                
#             print(f"Tipo: {chunk.get('type', 'N/A')}")
#             print(f"Texto: {_clean_chunk_text(chunk.get('markdown', 'N/A'))}")
#             print(f"Página: {chunk.get('grounding', 'N/A')['page']}")
#             print("---------------")
#         print("****************************************************")

In [None]:
# Merge every single-element group with the last element of the group before
# it, so that no group is left with only one chunk.
# NOTE(review): this cell slices each element of group_chunks, so it needs a
# list of lists; the Mena loading cells bind group_chunks to a flat list of
# chunk dicts. Presumably a Scott-specific grouping step must run first -- confirm.
# (`ic` is imported but not used in this cell.)
from icecream import ic

# Build the merged result in a new list
group_chunks_merged = []

for i, group_chunk in enumerate(group_chunks):
    # The first group has no predecessor: copy it as is
    if i == 0:
        group_chunks_merged.append(group_chunk[:])  # Copy of the group
    # The current group holds exactly one element
    elif len(group_chunk) == 1:
        # Take the previous group from the new list (already processed)
        previous_group = group_chunks_merged[-1]
        
        # Only merge when the previous group still has elements
        if len(previous_group) > 0:
            # Pick the last element of the previous group
            chunk_solution = previous_group[-1]
            
            # Remove that element from the previous group in the new list
            # (this may leave the previous group empty; empty groups are dropped at the end)
            group_chunks_merged[-1] = previous_group[:-1]
            
            # Build the merged group: [chunk_solution, current chunk]
            merged_group = [chunk_solution, group_chunk[0]]
            group_chunks_merged.append(merged_group)
            
            print(f"✓ Grupo {i} fusionado con último elemento del grupo {i-1}")
        else:
            # The previous group is already empty: just add the current one
            group_chunks_merged.append(group_chunk[:])
    else:
        # Groups that do not hold exactly one element are copied unchanged
        group_chunks_merged.append(group_chunk[:])

# Drop any groups that ended up empty
group_chunks_merged = [group for group in group_chunks_merged if len(group) > 0]


In [None]:
# Print type, cleaned text and page of every chunk in the first 30 merged
# groups, for manual inspection. (`ic` is imported but not used in this cell.)
from icecream import ic
for i,group_chunk in enumerate(group_chunks_merged[0:30]):    
    print("Group Number ",i)
    for chunk in group_chunk:
        print(f"Tipo: {chunk.get('type', 'N/A')}")
        print(f"Texto: {_clean_chunk_text(chunk.get('markdown', 'N/A'))}")
        # NOTE(review): a chunk without 'grounding' makes this line raise,
        # because the 'N/A' default cannot be indexed with 'page'.
        print(f"Página: {chunk.get('grounding', 'N/A')['page']}")
        print("---------------")
    print("****************************************************")

In [None]:
# Number of groups left after merging (shown as the cell output).
len(group_chunks_merged)

## Parseando el Catálogo Scott con LLM

In [None]:
# Import your stamp models (assuming they're in a file called 'kg_pydantic.py')
# from stamp_models import ScottEntry, ScottNumber, Denomination, ColorDescription, Perforation, MonetaryValue, PrintingMethod, StampType, PaperType
from kg_pydantic import *
# Load environment variables 
from dotenv import load_dotenv
load_dotenv()

In [None]:
"""
Simplified Scott Catalog Parser with Direct Examples
This version uses a more straightforward approach with explicit examples
"""

import os
import json
import re
from typing import List, Dict, Any, Optional
from decimal import Decimal
from datetime import datetime

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.callbacks import get_openai_callback



class SimpleScottParser:
    """Few-shot LLM parser for chunks of Scott catalogue text.

    A chat prompt (system rules + worked input/output examples + the chunk) is
    sent to an OpenAI chat model and the reply is decoded as JSON. The prompt
    asks for an object with the keys "stamps", "illustrations", "header" and
    "notes"; since the reply comes from a model, callers should not assume
    every key or field is present.
    """
    
    def __init__(self, openai_api_key: str, model_name: str = "gpt-4o-mini"):
        """Build the model client, the JSON output parser and the chain.

        Args:
            openai_api_key: key passed to ChatOpenAI.
            model_name: chat model to use.
        """
        self.llm = ChatOpenAI(
            temperature=0.0,
            model_name=model_name,
            openai_api_key=openai_api_key,
            max_tokens=4000
        )
        
        # Use JSON output parser instead of Pydantic
        self.output_parser = JsonOutputParser()
        
        # Create the chain
        self.chain = self._create_chain()
    
    def parse_chunk(self, text: str) -> Dict[str, Any]:
        """Run the chain on one chunk of catalogue text.

        Token usage and cost reported by the OpenAI callback are printed.

        Args:
            text: raw chunk text; it is sent to the model unmodified.

        Returns:
            The decoded JSON reply. If anything raises, the error is printed
            and ``{"stamps": [], "error": <message>}`` is returned instead, so
            a single bad chunk does not stop a batch run.
        """
        
        # Clean the text
        # text = re.sub(r'<a id=[\'"][^\'"]+[\'"]></a>\n*', '', text)
        # text = re.sub(r'<::(.*?)::>', r'\1', text, flags=re.DOTALL)
        # text = text.strip()
        
        try:
            with get_openai_callback() as cb:
                result = self.chain.invoke({"input": text})
                print("Prompt tokens:", cb.prompt_tokens)
                print("Completion tokens:", cb.completion_tokens)
                print("Total tokens:", cb.total_tokens)
                print("Costo (USD):", cb.total_cost)
                return result
        except Exception as e:
            print(f"Parsing error: {e}")
            return {"stamps": [], "error": str(e)}
    
    def _create_chain(self):
        """Create the parsing chain with explicit examples.

        Returns:
            The pipeline ``prompt | llm | JSON output parser``; it is invoked
            with ``{"input": <chunk text>}``.
        """
        
        # Define examples with exact input/output
        examples = [
            {
                "input": """1889 Black Overprint
23 A8 1c rose     5.00 3.00
24 A9 5c brown    7.00 3.00
Vertical and inverted overprints are fakes.""",
                "output": json.dumps({
                    "stamps": [
                        {
                            "scott_number": "23",
                            "illustration": "A8",
                            "denomination": "1c",
                            "color": "rose",
                            "mint_value": 5.00,
                            "used_value": 3.00
                        },
                        {
                            "scott_number": "24",
                            "illustration": "A9",
                            "denomination": "5c",
                            "color": "brown",
                            "mint_value": 7.00,
                            "used_value": 3.00
                        }
                    ],
                    "header": "1889 Black Overprint",
                    "notes": ["Vertical and inverted overprints are fakes."]
                }, indent=2)
            },
            {
                "input": """1901, Jan.                                Perf. 12-15½
45 A30    1c green & blk                  3.25    .30
a.        Horiz. pair, imperf. btwn.      150.00
46 A31    2c ver & blk                    1.25    .30""",
                "output": json.dumps({
                    "stamps": [
                        {
                            "scott_number": "45",
                            "illustration": "A30",
                            "denomination": "1c",
                            "color": "green & blk",
                            "mint_value": 3.25,
                            "used_value": 0.30,
                            "perforation": "12-15½"
                        },
                        {
                            "scott_number": "45a",
                            "variety_of": "45",
                            "description": "Horiz. pair, imperf. btwn.",
                            "mint_value": 150.00
                        },
                        {
                            "scott_number": "46",
                            "illustration": "A31",
                            "denomination": "2c",
                            "color": "ver & blk",
                            "mint_value": 1.25,
                            "used_value": 0.30,
                            "perforation": "12-15½"
                        }
                    ],
                    "header": "1901, Jan.",
                    "perforation": "Perf. 12-15½"
                }, indent=2)
            },
            {
                "input": """<::A dark-colored postage stamp with a portrait of a man with a mustache in the center. The stamp has "CORREOS" and "COSTA RICA" written in a circular pattern around the portrait. The number "5" is visible in the top left and bottom right corners, and "CENTAVOS" is at the bottom. The stamp has perforated edges. : postage stamp::>
President Bernardo Soto Alfaro — A7""",
                "output": json.dumps({
                    "stamps": [],
                    "illustrations": [
                        {
                            "illustration_number": "A7",
                            "design_name": "President Bernardo Soto Alfaro",
                            "design_description": "A dark-colored postage stamp with a portrait of a man with a mustache in the center. The stamp has \"CORREOS\" and \"COSTA RICA\" written in a circular pattern around the portrait. The number \"5\" is visible in the top left and bottom right corners, and \"CENTAVOS\" is at the bottom. The stamp has perforated edges.",
                            "denomination": "5 CENTAVOS"
                        }
                    ]
                }, indent=2)
            },
            {
                "input": """<::A collection of six postage stamps, each featuring a portrait of President Soto Alfaro. The stamps are arranged in two columns. Top row: - Left stamp (A10): A brown stamp with a portrait of a man, labeled "COSTA RICA" at the top and "1 CENTAVO" at the bottom, with the number "1" in the upper corners. - Right stamp (A11): A greenish-blue stamp with a portrait of a man, labeled "COSTA RICA" at the top and "2 CENTAVOS" at the bottom, with the number "2" in the upper corners. Middle row: - Left stamp (A12): A reddish-orange stamp with a portrait of a man, labeled "COSTA RICA" at the top and "5 CENTAVOS" at the bottom, with the number "5" in the upper corners. - Right stamp (A13): A reddish-orange stamp with a portrait of a man, labeled "COSTA RICA" at the top and "10 CENTAVOS" at the bottom, with the number "10" in the upper corners. Bottom row: - Left stamp (A14): A green stamp with a portrait of a man, labeled "COSTA RICA" at the top and "20 CENTAVOS" at the bottom, with the number "20" in the upper corners. - Right stamp (A15): A reddish-orange stamp with a portrait of a man, labeled "COSTA RICA" at the top and "50 CENTAVOS" at the bottom, with the number "50" in the upper corners. : figure::>""",
                "output": json.dumps({
                    "stamps": [],
                    "illustrations": [
                        {
                            "illustration_number": "A10",
                            "design_description": "A brown stamp with a portrait of President Soto Alfaro",
                            "denomination": "1 CENTAVO",
                            "color": "brown"
                        },
                        {
                            "illustration_number": "A11",
                            "design_description": "A greenish-blue stamp with a portrait of President Soto Alfaro",
                            "denomination": "2 CENTAVOS",
                            "color": "greenish-blue"
                        },
                        {
                            "illustration_number": "A12",
                            "design_description": "A reddish-orange stamp with a portrait of President Soto Alfaro",
                            "denomination": "5 CENTAVOS",
                            "color": "reddish-orange"
                        },
                        {
                            "illustration_number": "A13",
                            "design_description": "A reddish-orange stamp with a portrait of President Soto Alfaro",
                            "denomination": "10 CENTAVOS",
                            "color": "reddish-orange"
                        },
                        {
                            "illustration_number": "A14",
                            "design_description": "A green stamp with a portrait of President Soto Alfaro",
                            "denomination": "20 CENTAVOS",
                            "color": "green"
                        },
                        {
                            "illustration_number": "A15",
                            "design_description": "A reddish-orange stamp with a portrait of President Soto Alfaro",
                            "denomination": "50 CENTAVOS",
                            "color": "reddish-orange"
                        }
                    ]
                }, indent=2)
            }
        ]
        
        # Create the few-shot prompt
        example_prompt = ChatPromptTemplate.from_messages([
            ("human", "{input}"),
            ("ai", "{output}")
        ])
        
        few_shot_prompt = FewShotChatMessagePromptTemplate(
            example_prompt=example_prompt,
            examples=examples,
        )
        
        # Final prompt
        # NOTE: the system text must stay free of curly braces, which the
        # prompt template would read as input variables.
        final_prompt = ChatPromptTemplate.from_messages([
            ("system", """
    You are a Scott catalog parser. Extract stamp information and illustration descriptions from catalog text.

    CONTENT TYPES:
    1. STAMP ENTRIES: Lines starting with numbers (45, 46, 47) containing catalog data
    2. VARIETIES: Lines starting with letters (a., b., c.) that modify the stamp above
    3. ILLUSTRATIONS: Text within <::...::> describes stamp images, often followed by illustration numbers (A7, A10, etc.)

    RULES FOR STAMPS:
    - Main stamps start with numbers
    - Format: [number] [illustration] [denomination] [color] [mint_price] [used_price]
    - Multi-line entries continue from above
    - Prices: "3.25 .30" = mint $3.25, used $0.30
    - Dash (—) = no price

    RULES FOR ILLUSTRATIONS:
    - Text between <:: and ::> is a design description
    - Look for illustration numbers (A10, A11) within or after the description
    - Extract denomination and color when mentioned
    - May describe single or multiple stamps

    EXAMPLE COLOR ABBREVIATIONS:
    blk=black, grn=green, ver=vermillion, lil=lilac, ol=olive, bis=bistre, car=carmine, yel=yellow, brn=brown, dk=dark

    ALWAYS return valid JSON with the keys: "stamps", "illustrations", "header" and "notes" .
    Example of JSON for return:

    "stamps": [],        // If there are stamps if not empty
    "illustrations": [], // If there are illustrations if not empty
    "header": "1901, Jan.",  //Always if is possible 
    "notes": ["...."] //Always if there are notes if not empty


"""),
            few_shot_prompt,
            ("human", "{input}")
        ])
        
        return final_prompt | self.llm | self.output_parser
    
    def parse_and_display(self, text: str):
        """Parse a chunk and print the result in a readable format.

        The raw result is printed first, followed by stamps (varieties shown
        as sub-entries), illustration descriptions (long ones cut to 100
        characters), notes, header and perforation.

        Fields the model may have left out are displayed as "N/A" instead of
        raising, because the reply is model-generated and not schema-checked.

        Args:
            text: raw chunk text, passed to ``parse_chunk``.

        Returns:
            Whatever ``parse_chunk`` returned, unchanged. When the result is
            not a JSON object, or contains an "error" key, only the error is
            reported before returning.
        """
        
        # print("INPUT TEXT:")
        # print("-" * 60)
        # print(text[:500] + "..." if len(text) > 500 else text)
        # print("-" * 60)
        
        result = self.parse_chunk(text)
        print(result)
                
        print("\nPARSING RESULTS:")
        print("-" * 60)
        
        # The JSON parser can yield any JSON value (e.g. a top-level array);
        # only an object carries the keys displayed below.
        if not isinstance(result, dict):
            print(f"ERROR: expected a JSON object, got {type(result).__name__}")
            return result
        
        if "error" in result:
            print(f"ERROR: {result['error']}")
            return result
        
        # Display stamps
        stamps = result.get("stamps", [])
        if stamps:
            print(f"Found {len(stamps)} stamps:\n")
            
            for stamp in stamps:
                # .get with a fallback: the model may omit the catalogue number
                scott_number = stamp.get('scott_number', 'N/A')
                if stamp.get("variety_of"):
                    print(f"  └─ #{scott_number}: {stamp.get('description', 'Variety')}")
                    if stamp.get('mint_value'):
                        print(f"      Value: ${stamp['mint_value']}")
                else:
                    print(f"#{scott_number} ({stamp.get('illustration', 'N/A')}): "
                         f"{stamp.get('denomination', '')} {stamp.get('color', '')}")
                    if stamp.get('mint_value'):
                        print(f"  Values: ${stamp['mint_value']} mint / "
                             f"${stamp.get('used_value', 'N/A')} used")
        
        # Display illustrations
        illustrations = result.get("illustrations", [])
        if illustrations:
            print(f"\nFound {len(illustrations)} illustration descriptions:\n")
            
            for illus in illustrations:
                # A figure may be described without an illustration number
                print(f"Illustration {illus.get('illustration_number', 'N/A')}:")
                if illus.get('design_name'):
                    print(f"  Name: {illus['design_name']}")
                if illus.get('denomination'):
                    print(f"  Denomination: {illus['denomination']}")
                if illus.get('color'):
                    print(f"  Color: {illus['color']}")
                desc = illus.get('design_description', '')
                if desc:
                    # Truncate long descriptions for display
                    if len(desc) > 100:
                        print(f"  Description: {desc[:100]}...")
                    else:
                        print(f"  Description: {desc}")
        
        # Display notes
        notes = result.get("notes")
        if isinstance(notes, str):
            # A single note returned as a bare string would otherwise be
            # printed one character per bullet.
            notes = [notes]
        if notes:
            print("\nNOTES:")
            for note in notes:
                print(f"  • {note}")
        
        # Display header info
        if result.get("header"):
            print(f"\nHEADER: {result['header']}")
        if result.get("perforation"):
            print(f"PERFORATION: {result['perforation']}")
        
        print("-" * 60)
        return result


# Test function
def test_parser():
    """Run SimpleScottParser on a fixed sample chunk and report the outcome.

    The sample is parsed with ``parse_and_display``, the result is written to
    ``scott_parse_results.json`` and a short count summary is printed.

    Returns:
        The object returned by ``parse_and_display`` for the sample text.
    """
    # Fixed sample input (surcharge listing followed by a markdown table)
    sample_text = """1881-82
Red or Black Surcharge
7 A1(a) 1c on ½r ('82) 3.00 6.00
a. On No. 1a 15.00 -
8 A1(b) 1c on ½r ('82) 18.00 30.00
9 A1(c) 2c on ½r, #1a 3.00 2.75
a. On No. 1 8.00
12 A1(c) 5c on ½r 15.00
13 A1(d) 5c on ½r ('82) 35.00
14 A1(d) 10c on 2r (Bk)
('82) 72.50 -
15 A1(e) 20c on 4r ('82) 300.00 -

Overprints with different fonts and "OFICIAL" were never placed in use, and are said to have been surcharged to a dealer's order. The ½r surcharged "DOS CTS" is not a postage stamp. It probably is an essay.
Postally used examples of Nos. 7-15 are rare. Nos. 13-15 exist with a favor cancel having a hyphen between "San" and "Jose." Values same as unused. Fake cancellations exist.
Counterfeits exist of surcharges on Nos. 7-15.
---------------
 stamp of Gen. Prospero Fernández : figure

Gen. Prospero
Fernández - A6

1883, Jan. 1

| | | | | |
|---|---|---|---|---|
| 16 | A6 | 1c green | 3.00 | 1.50 |
| 17 | A6 | 2c carmine | 3.25 | 1.50 |
| 18 | A6 | 5c blue violet | 32.50 | 2.00 |
| 19 | A6 | 10c orange | 150.00 | 12.00 |
| 20 | A6 | 40c blue | 3.00 | 3.00 |
| Nos. 16-20 (5) | | | 191.75 | 20.00 |

Unused examples of 40c usually lack gum.
For overprints see Nos. O1-O20, O24,
Guanacaste 1-38, 44.
    
"""

    # The API key comes from the environment; the literal is only a placeholder fallback
    scott_parser = SimpleScottParser(
        openai_api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
        model_name="gpt-4o-mini"
    )

    parsed = scott_parser.parse_and_display(sample_text)

    # Persist the parsed structure
    with open("scott_parse_results.json", "w") as out_file:
        json.dump(parsed, out_file, indent=2)

    print("\nResults saved to scott_parse_results.json")

    # Split entries by whether they carry a truthy "variety_of" value
    all_entries = parsed.get("stamps", [])
    primary_entries = []
    variety_entries = []
    for entry in all_entries:
        if entry.get("variety_of"):
            variety_entries.append(entry)
        else:
            primary_entries.append(entry)

    print("\nSUMMARY:")
    print(f"  Total entries: {len(all_entries)}")
    print(f"  Main stamps: {len(primary_entries)}")
    print(f"  Varieties: {len(variety_entries)}")

    return parsed


# if __name__ == "__main__":
#     # Run the test
#     test_parser()

In [None]:
# Display how many entries group_chunks_merged holds.
len(group_chunks_merged)

In [None]:
# Pick one group (index 592), concatenate the cleaned markdown of each of its
# chunks into group_text, and print the combined text for inspection.
group_chunk = group_chunks_merged[592]
group_text = ""
for chunk in group_chunk:
        # Per-chunk debug output (disabled):
        # print(f"Type: {chunk.get('type', 'N/A')}")
        # print(f"Text: {_clean_chunk_text(chunk.get('markdown', 'N/A'))}")
        # print(f"Page: {chunk.get('grounding', 'N/A')['page']}")
        # print("---------------")
        group_text += _clean_chunk_text(chunk.get('markdown', 'N/A'))
print(group_text)


In [None]:
# Parse one group text (group_text must already be defined by an earlier cell),
# save the parsed result as JSON and print a count summary.
parser = SimpleScottParser(
    openai_api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
    model_name="gpt-4o-mini"
)

# Parse and display
result = parser.parse_and_display(group_text)

# Save results. The path lives in one variable so the confirmation message
# always names the file that was actually written (it previously reported
# "scott_parse_results.json" while writing "scott_parse_results_1-17.json").
output_path = "scott_parse_results_1-17.json"
with open(output_path, "w") as f:
    json.dump(result, f, indent=2)

print(f"\nResults saved to {output_path}")

# Summary: entries with a truthy "variety_of" count as varieties
stamps = result.get("stamps", [])
main_stamps = [s for s in stamps if not s.get("variety_of")]
varieties = [s for s in stamps if s.get("variety_of")]
illustrations = result.get("illustrations", [])

print("\nSUMMARY:")
print(f"  Total entries: {len(stamps)}")
print(f"  Main stamps: {len(main_stamps)}")
print(f"  Varieties: {len(varieties)}")
print(f"  Illustrations: {len(illustrations)}")

In [None]:
# Accumulators for a parsing run: parsed outputs and per-group error records.
# Re-running this cell resets both lists.
results = []
error_groups = []

In [None]:
import os, json, time, datetime, traceback
from tqdm import tqdm


# Sequentially parse every group from start_num onwards, collecting outputs in
# `results` and failures in `error_groups` (both must already exist).
parser = SimpleScottParser(
    openai_api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
    model_name="gpt-4o-mini"
)

total = len(group_chunks_merged)
start = time.perf_counter()

start_num = 1               # 1-based number of the first group to process
start_idx = start_num - 1   # matching 0-based index into group_chunks_merged

remaining = len(group_chunks_merged[start_idx:])

# The bar is sized by the groups processed in this run (not by the whole list),
# so it reaches 100% even when start_num > 1.
with tqdm(total=remaining, desc="Parseando grupos", unit="grp") as pbar:
    for i, group_chunk in enumerate(group_chunks_merged[start_idx:], start_num):
        t0 = time.perf_counter()
        try:
            group_text = "".join(_clean_chunk_text(ch.get('markdown', 'N/A')) for ch in group_chunk)
            result = parser.parse_and_display(group_text)
            results.append(result)

        except Exception as e:
            # Record the group number, the error and its traceback for debugging
            error_groups.append({
                "group_number": i,
                "error": str(e),
                "traceback": traceback.format_exc()
                # Optional: "sample": group_text[:500] if 'group_text' in locals() else ""
            })
        finally:
            # Update timing metrics/ETA and the bar even when the group failed
            iter_sec = time.perf_counter() - t0
            elapsed = time.perf_counter() - start
            done = (i - start_num + 1)  # iterations so far in this run (successes + failures)
            avg = elapsed / done
            remaining_sec = avg * (remaining - done)
            eta = datetime.timedelta(seconds=max(0, int(remaining_sec)))

            pbar.set_postfix(iter_s=f"{iter_sec:.2f}", avg_s=f"{avg:.2f}", eta=str(eta))
            pbar.update(1)




In [None]:
# Save the accumulated results.
# The target directory is created first so the write cannot fail on a fresh
# checkout, and the confirmation message reports the path actually written.
output_path = "results/parsed_catalogues/scott_parse_results_1-17.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nResults saved to {output_path}")
print(f"Tiempo total: {datetime.timedelta(seconds=int(time.perf_counter()-start))}")

### Código en Batches

In [None]:
import os, json, time, datetime
from time import sleep
from tqdm import tqdm
from itertools import islice
from langchain_community.callbacks.manager import get_openai_callback

def chunked(it, size):
    """Yield successive lists of up to ``size`` items taken from ``it``.

    Args:
        it: Any iterable; it is consumed lazily, one batch at a time.
        size: Maximum number of items per yielded list.

    Yields:
        Lists of items in their original order; the last one may be shorter.
        Nothing is yielded for an empty iterable.
    """
    source = iter(it)
    batch = list(islice(source, size))
    while batch:
        yield batch
        batch = list(islice(source, size))

# --- Input preparation ---
# error_groups is created before the inputs are built so that a failure while
# assembling a group's text is recorded instead of being silently dropped.
error_groups = []
inputs = []
start_num = 1
start_idx = start_num - 1
for i, group_chunk in enumerate(group_chunks_merged[start_idx:], start_num):
    try:
        group_text = "".join(_clean_chunk_text(ch.get('markdown', 'N/A')) for ch in group_chunk)
    except Exception as e:
        # The group is not sent to the LLM, but the failure is logged with its number
        error_groups.append({
            "group_number": i,
            "error": f"PREP_FAILURE: {type(e).__name__}: {str(e)}"
        })
        continue
    inputs.append({"i": i, "input": group_text})

max_concurrency = 3      # adjust to the API rate limits
subbatch_size   = 5      # number of inputs sent per wave
max_retries     = 2      # total attempts per wave

# results[k] holds the output for inputs[k]; None marks a failed item
results = [None] * len(inputs)

t0 = time.perf_counter()
with tqdm(total=len(inputs), desc="Parseando (batch)", unit="grp") as pbar:
    with get_openai_callback() as cb:
        base = 0  # index in `inputs` of the first item of the current wave
        for sub in chunked(inputs, subbatch_size):
            sub_payload = [{"input": s["input"]} for s in sub]

            # --- Batch call with retries ---
            outs = None
            batch_error = None
            for attempt in range(1, max_retries + 1):
                try:
                    outs = parser.chain.batch(
                        sub_payload,
                        config={"max_concurrency": max_concurrency},
                        return_exceptions=True  # per-item errors come back as Exception objects
                    )
                    break  # success: leave the retry loop
                except Exception as e:
                    batch_error = e
                    # Simple exponential backoff between attempts
                    if attempt < max_retries:
                        sleep(2 ** (attempt - 1))

            if outs is None:
                # The whole wave failed (or no attempt was made): mark every item
                # of the wave as an error and still advance the progress bar.
                if batch_error is not None:
                    reason = f"{type(batch_error).__name__}: {str(batch_error)}"
                else:
                    reason = "no attempt was made (max_retries < 1)"
                for item in sub:
                    error_groups.append({
                        "group_number": item["i"],
                        "error": f"BATCH_FAILURE: {reason}"
                    })
                pbar.update(len(sub))
                base += len(sub)
                continue

            # --- Process per-item outputs ---
            for j, out in enumerate(outs):
                item = sub[j]
                if isinstance(out, Exception):
                    error_groups.append({
                        "group_number": item["i"],
                        "error": f"ITEM_FAILURE: {type(out).__name__}: {str(out)}"
                    })
                else:
                    results[base + j] = out
                pbar.update(1)

            base += len(sub)

        print(f"\nTokens prompt: {cb.prompt_tokens} | completion: {cb.completion_tokens} | total: {cb.total_tokens}")
        print(f"Costo total (USD): {cb.total_cost:.6f}")

elapsed = time.perf_counter() - t0
print(f"Tiempo total: {datetime.timedelta(seconds=int(elapsed))}")

# --- Save ---
os.makedirs("results/parsed_catalogues", exist_ok=True)
ok = [r for r in results if r is not None]
with open("results/parsed_catalogues/scott_parse_results_18-34.json", "w", encoding="utf-8") as f:
    json.dump(ok, f, indent=2, ensure_ascii=False)

with open("results/parsed_catalogues/scott_parse_errors_18-34.json", "w", encoding="utf-8") as f:
    json.dump(error_groups, f, indent=2, ensure_ascii=False)

print(f"OK: {len(ok)} | Errores: {len(error_groups)}")


## Catálogo Mena Parser

In [None]:
"""
Mena Catalog Parser (Costa Rica) — Simplified, Robust, English-only
Author: (Your Name)

- Parses Mena-style catalog fragments into a normalized JSON:
  {
    "issue_data": {...},
    "production_orders": {...},
    "stamps": [...],
    "varieties": [...],
    "proofs": {...},
    "essays": [...],
    "specimens": []
  }

- If a section is not present in the text, it is still returned
  with empty containers (e.g., "specimens": []).

- Few-shot examples: surcharges/varieties, production orders/plates, proofs DP/PP.
"""

import os
import json
import re
from typing import List, Dict, Any, Optional
from datetime import datetime

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.callbacks import get_openai_callback
from langchain_ibm import ChatWatsonx



# ----------------------------
# Helpers for normalization
# ----------------------------

def remove_null_quantities(data: Dict[str, Any]) -> None:
    """Drop null quantities from production-order printings, in place.

    For every printing under ``data["production_orders"]["printings"]`` only
    the quantity entries whose ``"quantity"`` is not ``None`` are kept.
    Printings left without any valid quantity are removed and a warning is
    printed for each of them.

    Missing or ``None`` containers (``production_orders``, ``printings``, a
    printing's ``quantities``) are treated as empty instead of raising, and
    entries that are not dicts are discarded because they cannot carry a
    quantity.

    Args:
        data: Parsed catalog structure; modified in place.
    """
    prod = data.get("production_orders") or {}
    if not isinstance(prod, dict):
        return

    kept_printings = []
    for printing in prod.get("printings") or []:
        if not isinstance(printing, dict):
            continue

        # "quantities" may be absent or null in LLM output
        quantities = printing.get("quantities") or []
        valid_quantities = [
            q for q in quantities
            if isinstance(q, dict) and q.get("quantity") is not None
        ]
        printing["quantities"] = valid_quantities

        if valid_quantities:
            kept_printings.append(printing)
        else:
            print(f"⚠️ Printing on {printing.get('date')} has no valid quantities")

    # Only printings that still have quantities survive
    prod["printings"] = kept_printings

def validate_varieties(data: Dict[str, Any]) -> None:
    """Keep only varieties whose base catalog number is known, in place.

    A base number is known when it appears as ``catalog_no`` of an entry in
    ``data["stamps"]`` or ``data["postal_stationery"]``. The surviving
    varieties replace ``data["varieties"]``. Diagnostic lines describing each
    decision are printed.

    Args:
        data: Parsed catalog structure; modified in place.
    """
    stamp_entries = data.get("stamps", [])
    stationery_entries = data.get("postal_stationery", [])

    known_numbers = {entry.get("catalog_no", "") for entry in stamp_entries}
    known_numbers.update({entry.get("catalog_no", "") for entry in stationery_entries})

    # Diagnostic output
    print(f"\nDEBUG validate_varieties:")
    print(f"  Valid catalog nos: {known_numbers}")

    candidates = data.get("varieties", [])
    print(f"  Varieties to validate: {len(candidates)}")

    accepted = []
    for candidate in candidates:
        base = candidate.get("base_catalog_no", "")
        suffix = candidate.get("suffix", "")
        base_is_known = base in known_numbers
        print(f"  Checking: base='{base}', suffix='{suffix}' -> '{base}' in valid? {base_is_known}")

        if not base_is_known:
            print(f"  ⚠️ Removed: {base}{suffix} (base not found)")
            continue
        accepted.append(candidate)

    data["varieties"] = accepted
    print(f"  Final varieties: {len(accepted)}\n")

def mena_schema_stub() -> Dict[str, Any]:
    """Build a fresh, empty instance of the Mena output schema.

    Every call creates new containers, so the returned structure can be
    mutated freely without affecting later calls.

    Returns:
        A dict with the keys ``issue_data``, ``production_orders``, ``stamps``,
        ``varieties``, ``proofs``, ``essays``, ``specimens`` and
        ``postal_stationery``, all filled with empty defaults.
    """
    issue_dates = {
        "announced": None,
        "placed_on_sale": None,
        "probable_first_circulation": None,
        "second_plate_sale": None,
        "demonetized": None,
    }
    currency_context = {
        "original": "",
        "decimal_adoption": None,
        "revaluation_date": None,
        "revaluation_map": {},
    }
    printing = {
        "printer": "",
        "process": [],
        "format": {"panes": None},
        "plates": {},
    }
    issue_data = {
        "issue_id": "",
        "section": "",
        "title": "",
        "country": "",
        "issue_dates": issue_dates,
        "legal_basis": [],
        "currency_context": currency_context,
        "printing": printing,
        "perforation": "",
    }
    production_orders = {
        "printings": [],
        "remainders": {"date": None, "note": "", "quantities": []},
    }
    proof_kinds = ("die_proofs", "plate_proofs", "color_proofs", "imperforate_proofs")
    proofs = {kind: [] for kind in proof_kinds}

    return {
        "issue_data": issue_data,
        "production_orders": production_orders,
        "stamps": [],
        "varieties": [],
        "proofs": proofs,
        "essays": [],
        "specimens": [],
        "postal_stationery": [],
    }

def migrate_lettered_to_varieties(data: Dict[str, Any]) -> None:
    """
    Move only lowercase-suffixed catalog numbers (e.g., '17a', '7b') into 'varieties'.
    Preserve uppercase catalog numbers (e.g., '1A') as MAIN STAMPS.

    Each migrated stamp is appended to ``data["varieties"]`` as a dict with the
    keys ``base_catalog_no``, ``suffix``, ``type`` ("color" when the stamp has
    a color, otherwise "other"), ``description`` (color and first note joined
    by ", ", or "variety" when both are empty), ``position`` (always None) and
    ``plate``. ``data["stamps"]`` is replaced by the remaining entries.

    Tolerated input shapes:
      - ``notes`` may be a list or a single string (a string is treated as one
        note; previously it raised ``TypeError``).
      - ``catalog_no`` may be a non-string such as an int (it is compared via
        ``str()`` and left unchanged on the stamp).
      - entries that are not dicts are kept untouched among the stamps.

    Args:
        data: Parsed catalog structure; modified in place.
    """
    stamps = data.get("stamps") or []
    keep_stamps = []

    varieties = data.get("varieties")
    if not isinstance(varieties, list):
        varieties = []
        data["varieties"] = varieties

    # ONLY lowercase letters count as variety suffix
    VARIETY_RE = re.compile(r"^(\d+)([a-z]+)$")  # <-- lowercase only

    for s in stamps:
        if not isinstance(s, dict):
            keep_stamps.append(s)
            continue

        cat = str(s.get("catalog_no") or "").strip()
        m = VARIETY_RE.fullmatch(cat)
        if not m:
            keep_stamps.append(s)
            continue

        base_no, suffix = m.group(1), m.group(2)
        color = str(s.get("color") or "").strip()

        # Normalize notes to a list so slicing/concatenation is always valid
        notes = s.get("notes") or []
        if isinstance(notes, str):
            notes = [notes]
        elif not isinstance(notes, (list, tuple)):
            notes = []
        first_note = [str(n) for n in notes[:1] if n]

        desc = ", ".join([p for p in [color] + first_note if p])

        varieties.append({
            "base_catalog_no": base_no,
            "suffix": suffix,
            "type": "color" if color else "other",
            "description": desc or "variety",
            "position": None,
            "plate": s.get("plate")
        })

    data["stamps"] = keep_stamps

def normalize_variety_types(data):
    """Canonicalize the ``type`` of every variety and trim its description, in place.

    Steps per variety:
      1. If the type is empty or "other", infer it from the description:
         "surc..." -> "surcharge"; "overprint" or the standalone word "op"
         -> "overprint". The word-boundary match keeps words such as "top"
         or "copy" from being read as "op".
      2. Map known aliases (e.g. "op", "o/p", "surch") to their canonical
         name. The lookup uses the stripped, lower-cased type so "OP" and
         " op " are recognised as well.
      3. Store the stripped description.

    A variety without a ``type`` key and without description hints ends up with
    ``type`` set to ``None`` instead of raising ``KeyError``. Types that are not
    aliases are stored unchanged.

    Args:
        data: Parsed catalog structure whose ``"varieties"`` list is updated.
    """
    mapping = {
        "op": "overprint",
        "overprnt": "overprint",
        "o/p": "overprint",
        "surch": "surcharge",
        "dbl surcharge": "surcharge",
        "double op": "overprint",
        "double overprint": "overprint",
        "inv op": "overprint",
        "inverted op": "overprint"
    }
    for v in data.get("varieties", []):
        raw_type = v.get("type")
        t = str(raw_type or "").strip().lower()
        desc = (v.get("description") or "").strip()

        # re-type by description hints if type is empty/other
        if not t or t == "other":
            lower = desc.lower()
            if "surc" in lower:
                raw_type = t = "surcharge"
            elif "overprint" in lower or re.search(r"\bop\b", lower):
                raw_type = t = "overprint"

        # canonicalize common aliases; unknown types are kept as given
        v["type"] = mapping.get(t, raw_type)
        v["description"] = desc

def dedupe_varieties(data):
    """Remove duplicate varieties in place, keeping the first occurrence.

    Two varieties are duplicates when they share base catalog number, suffix,
    type and description; type and description are compared case-insensitively
    and a missing/None value counts as an empty string. Order is preserved.

    Args:
        data: Parsed catalog structure; ``data["varieties"]`` is replaced.
    """
    first_seen = {}
    for variety in data.get("varieties", []):
        signature = (
            variety.get("base_catalog_no", ""),
            variety.get("suffix", ""),
            (variety.get("type") or "").lower(),
            (variety.get("description") or "").lower(),
        )
        # setdefault keeps the earliest entry; dicts preserve insertion order
        first_seen.setdefault(signature, variety)
    data["varieties"] = list(first_seen.values())

def _normalize_mena_unit(raw: Optional[str], value: Optional[float]) -> Optional[str]:
    """
    Normalize Mena monetary units PRESERVING CASE DISTINCTION:
      - centavo/céntimo/centime/etc.  -> "c" (lowercase)
      - colón/colon/colones/₡         -> "C" (uppercase)
      - peso/pesos                    -> "P" (uppercase)
      - real/reales/r/rs              -> "real" or "reales" (plural based on value)
    Leaves unknowns untouched.

    Punctuation noise (".", "-", "_") is ignored for every form, including the
    single-letter codes, so "c." -> "c" and "C." -> "C".

    Args:
        raw: Unit text as extracted; ``None`` is returned unchanged.
        value: Face value used only to choose "real" vs "reales". Numbers,
            numeric strings, "n/m" fractions and single fraction characters
            such as "½" are understood; anything else counts as unknown.

    Returns:
        The normalized unit, or the stripped input when it is not recognised.
    """
    import unicodedata  # local import: only needed to read characters like "½"

    if raw is None:
        return raw

    # Preserve original for the "unknown" fallback
    original = raw.strip()

    # Punctuation-free form that still keeps the case (needed for c vs C)
    compact = original.replace(".", "").replace("-", "").replace("_", "").strip()

    # Lowercase version for synonym matching
    u_normalized = compact.lower()

    # CRITICAL: check single-letter codes FIRST, case-sensitively
    if compact == "c":
        return "c"  # centavo (lowercase)
    if compact == "C":
        return "C"  # Colón (uppercase)
    if compact in ("P", "p"):
        return "P"  # Peso

    # Cent equivalents → "c"
    cent_syn = {
        "ct", "cts",
        "cent", "cents",
        "centavo", "centavos",
        "centimo", "centimos",
        "céntimo", "céntimos",
        "centime", "centimes",
        "¢"
    }

    # Peso equivalents → "P"
    peso_syn = {"peso", "pesos", "$p", "₱"}

    # Colón equivalents → "C"
    colon_syn = {"colon", "colón", "colones", "crc", "₡", "costa rican colon"}

    # Real equivalents → "real"/"reales"
    real_syn = {"r", "rs", "real", "reales"}

    if u_normalized in cent_syn:
        return "c"
    if u_normalized in peso_syn:
        return "P"
    if u_normalized in colon_syn:
        return "C"
    if u_normalized in real_syn:
        # Pick singular/plural based on value (best effort)
        try:
            if isinstance(value, str):
                text = value.strip()
                if re.fullmatch(r"\d+/\d+", text):
                    # simple fractions like "1/2"
                    num, den = text.split("/")
                    value_num = float(num) / float(den)
                elif len(text) == 1:
                    # single character: digit or vulgar fraction such as "½"
                    value_num = unicodedata.numeric(text)
                else:
                    value_num = float(text)
            else:
                value_num = float(value) if value is not None else None
        except (ValueError, TypeError, ZeroDivisionError, OverflowError):
            # Unparseable value: fall back to the plural form below
            value_num = None

        if value_num is not None and value_num <= 1.0:
            return "real"  # singular for 0.5, 1.0
        return "reales"  # plural (also used when the value is unknown)

    # Unknown → return original (preserve for safety)
    return original


def _clean_perforation(raw: Any) -> str:
    """Return the perforation text without a leading 'perf' word.

    The leading words "perf", "perforated" and "perforation(s)" are removed
    together with the punctuation that follows them. When such a prefix is
    present the remainder is returned lower-cased (as before); otherwise the
    stripped text is returned unchanged. Falsy input yields "".
    """
    text = str(raw).strip() if raw else ""
    lowered = text.lower()
    prefix = re.match(r"perf(?:orat(?:ed|ions?))?", lowered)
    if prefix is None:
        return text
    return lowered[prefix.end():].strip().lstrip(".:;,- ")


def normalize_units_and_perf(data: Dict[str, Any]) -> None:
    """
    - Normalize denomination units according to Mena system:
        centavo/céntimo → "c"
        peso            → "P"
        colón           → "C"
        real/reales     → "real"/"reales" (plurality by value)
    - Normalize perforations: keep only the gauge if input has 'perf ...'

    Works in place on ``data``. Whole-word prefixes are removed entirely
    ("Perforated 12" → "12", not "orated 12"), non-string perforation values
    (e.g. the number 12) are converted to text instead of raising, and a
    denomination that is not a dict is left untouched.
    """
    # Stamp-level normalization
    for s in data.get("stamps", []):
        if not isinstance(s, dict):
            continue

        den = s.get("denomination") or {}
        if isinstance(den, dict):
            den["unit"] = _normalize_mena_unit(den.get("unit"), den.get("value"))
            s["denomination"] = den

        # perforation: drop the word 'perf...' and punctuation; keep the gauge
        s["perforation"] = _clean_perforation(s.get("perforation"))

    # Issue-level perforation (if ever present there); only rewritten when it
    # actually starts with a 'perf' word
    issue = data.get("issue_data", {}) or {}
    perf_issue = issue.get("perforation")
    perf_issue_text = str(perf_issue).strip() if perf_issue else ""
    if perf_issue_text.lower().startswith("perf"):
        issue["perforation"] = _clean_perforation(perf_issue_text)
        data["issue_data"] = issue

def coerce_notes_fields(obj):
    """Recursively turn every ``"notes"`` value into a plain string, in place.

    Conversion rules for a value stored under the key ``"notes"``:
      - list  -> its str/int/float items joined with single spaces and
                 stripped (items of other types are dropped)
      - None  -> ""
      - other non-string -> ``str(value)``
      - string -> left as is

    All other dict values and all list items are visited recursively.

    Args:
        obj: Arbitrary nested structure of dicts and lists; anything else is ignored.
    """
    if isinstance(obj, list):
        for element in obj:
            coerce_notes_fields(element)
        return

    if not isinstance(obj, dict):
        return

    for key in list(obj):
        current = obj[key]
        if key != "notes":
            coerce_notes_fields(current)
            continue

        if isinstance(current, list):
            # keep only scalar text/number items and join them with a space
            scalars = [str(part) for part in current if isinstance(part, (str, int, float))]
            obj[key] = " ".join(scalars).strip()
        elif current is None:
            obj[key] = ""
        elif not isinstance(current, str):
            obj[key] = str(current)


def coerce_to_mena_schema(obj: Dict[str, Any]) -> Dict[str, Any]:
    """
    Make ``obj`` conform to the Mena schema produced by ``mena_schema_stub()``.

    Missing keys (at any nesting level of the stub's dicts) are filled with the
    stub's empty defaults, dict-valued fields holding a non-dict are replaced
    by the default dict, and the known list-valued fields are reset to ``[]``
    when they do not hold a list.

    Args:
        obj: Parsed structure; modified in place when it is a dict.

    Returns:
        ``obj`` itself after completion, or a fresh empty schema when ``obj``
        is not a dict.
    """
    template = mena_schema_stub()
    if not isinstance(obj, dict):
        return template

    def fill_missing(dest: Dict[str, Any], defaults: Dict[str, Any]) -> None:
        # Copy absent keys from defaults; descend only into dict-valued defaults
        for name, default_value in defaults.items():
            if name not in dest:
                dest[name] = default_value
            elif isinstance(default_value, dict):
                if isinstance(dest[name], dict):
                    fill_missing(dest[name], default_value)
                else:
                    dest[name] = default_value

    fill_missing(obj, template)

    # Top-level collections must be lists
    for list_key in ("stamps", "varieties", "essays", "specimens", "postal_stationery"):
        if not isinstance(obj[list_key], list):
            obj[list_key] = []

    # Each proof category must be a list
    proofs = obj.get("proofs", {})
    if isinstance(proofs, dict):
        for proof_kind in ("die_proofs", "plate_proofs", "color_proofs", "imperforate_proofs"):
            if not isinstance(proofs.get(proof_kind), list):
                proofs[proof_kind] = []
    else:
        obj["proofs"] = mena_schema_stub()["proofs"]

    # Production orders: printings and remainder quantities must be lists
    orders = obj.get("production_orders", {})
    if isinstance(orders, dict):
        if not isinstance(orders.get("printings"), list):
            orders["printings"] = []
        remainders = orders.get("remainders", {})
        if isinstance(remainders, dict):
            if not isinstance(remainders.get("quantities"), list):
                remainders["quantities"] = []
        else:
            orders["remainders"] = mena_schema_stub()["production_orders"]["remainders"]
    else:
        obj["production_orders"] = mena_schema_stub()["production_orders"]

    return obj


# ----------------------------
# The Parser
# ----------------------------

class MenaParser:
    """LLM-driven parser specialized for Mena Catalog (Costa Rica)."""

    def __init__(self, openai_api_key: str, model_name: str = "gpt-4o-mini", temperature: float = 0.0):
        """Create the chat model, the JSON output parser and the parsing chain.

        Args:
            openai_api_key: API key passed to ``ChatOpenAI``.
            model_name: Name of the chat model to use.
            temperature: Sampling temperature passed to the model.
        """
        # Extra request arguments forwarded through model_kwargs.
        # NOTE(review): presumably not every model accepts these - confirm
        # against the model actually selected via model_name.
        extra_model_args = {
            "verbosity": "low",
            "reasoning_effort": "low",
        }
        self.llm = ChatOpenAI(
            model=model_name,
            api_key=openai_api_key,
            temperature=temperature,
            timeout=300.0,
            model_kwargs=extra_model_args,
            # max_tokens=28000,
        )
        # Alternative backend kept for reference (disabled):
        # self.llm = ChatWatsonx(
        #   model_id= "meta-llama/llama-3-3-70b-instruct", #"openai/gpt-5-mini"
        #   url=os.getenv("WATSONX_URL"),
        #   apikey=os.getenv("WATSONX_API_KEY"),
        #   project_id=os.getenv("WATSONX_PROJECT_ID"),
        #   params={
        #       "temperature": 0,
        #       "verbosity": "low",
        #       "reasoning_effort" : "low"
        #   })
        self.output_parser = JsonOutputParser()
        self.chain = self._create_chain()
    
    
    
    def parse_chunk(self, text: str) -> Dict[str, Any]:
        """
        Parse one chunk of Mena catalog text into the normalized Mena JSON.

        The text is pre-cleaned, sent through the chain, and the answer is then
        coerced to the full schema and passed through the normalization steps
        in a fixed order. Token usage and an estimated cost are printed. If the
        chain call raises, the error is printed and an empty result is used, so
        the full (empty) schema is still returned.

        Args:
            text: Raw catalog fragment.

        Returns:
            The normalized result dict.
        """
        prepared = self._preclean_text(text)
        try:
            with get_openai_callback() as usage:
                result = self.chain.invoke({"input": prepared})
                print("Prompt tokens:", usage.prompt_tokens)
                print("Completion tokens:", usage.completion_tokens)
                print("Total tokens:", usage.total_tokens)

                # USD per one million tokens, turned into per-token rates.
                # NOTE(review): rates are hard-coded; confirm they match the model in use.
                usd_per_input_token = 0.250 / 1_000_000
                usd_per_output_token = 2.0 / 1_000_000
                prompt_cost = usage.prompt_tokens * usd_per_input_token
                completion_cost = usage.completion_tokens * usd_per_output_token
                print("Cost (USD):", prompt_cost + completion_cost)
        except Exception as exc:
            print(f"Parsing error: {exc}")
            result = {}

        normalized = coerce_to_mena_schema(result)
        # The order of these in-place steps is significant and mirrors the pipeline design
        pipeline = (
            remove_null_quantities,
            migrate_lettered_to_varieties,
            normalize_units_and_perf,
            normalize_variety_types,
            coerce_notes_fields,
            dedupe_varieties,
            validate_varieties,
        )
        for step in pipeline:
            step(normalized)
        self._light_sanitize(normalized)
        return normalized

    def parse_and_save(self, text: str, out_path: str = "mena_parse_result.json") -> Dict[str, Any]:
        result = self.parse_chunk(text)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"Saved: {out_path}")
        return result

    # ---------- Internals ----------

    def expand_rowspan_cells(self, html: str) -> str:
        """
        Pre-process HTML to expand rowspan cells by duplicating content.

        Example:
        <tr><td>A</td><td rowspan="2">shared</td><td>C</td></tr>
        <tr><td>D</td><td>E</td></tr>

        Becomes:
        <tr><td>A</td><td>shared</td><td>C</td></tr>
        <tr><td>D</td><td>shared</td><td>E</td></tr>

        Carried-down cells are inserted in ascending column order, so rows that
        receive several of them (two or more active rowspans) keep every cell
        in its proper column.

        Args:
            html: Text containing one or more HTML tables.

        Returns:
            The re-serialized markup with rowspans expanded. If anything fails,
            the problem is printed and the original ``html`` is returned
            unchanged.
        """
        from bs4 import BeautifulSoup

        try:
            soup = BeautifulSoup(html, 'html.parser')

            for table in soup.find_all('table'):
                rows = table.find_all('tr')

                # pending[row_idx][col_idx] -> cell that must be duplicated there
                pending = {}

                for i, row in enumerate(rows):
                    cells = row.find_all(['td', 'th'])

                    # Copies owed to this row by rowspans opened in earlier rows,
                    # sorted by target column
                    cells_to_insert = []
                    for col_idx, source_cell in sorted(pending.get(i, {}).items()):
                        # Re-parse the cell to obtain an independent copy
                        cell_copy = BeautifulSoup(str(source_cell), 'html.parser').find(['td', 'th'])
                        if cell_copy is None:
                            continue
                        if 'rowspan' in cell_copy.attrs:
                            del cell_copy['rowspan']
                        cells_to_insert.append((col_idx, cell_copy))
                    occupied_cols = {pos for pos, _ in cells_to_insert}

                    # Walk this row's own cells, skipping columns taken by carried cells
                    current_col = 0
                    for cell in cells:
                        while current_col in occupied_cols:
                            current_col += 1

                        rowspan = cell.get('rowspan')
                        if rowspan and int(rowspan) > 1:
                            rowspan_count = int(rowspan)

                            # Schedule this cell for each following row it spans
                            for j in range(1, rowspan_count):
                                target_row = i + j
                                if target_row < len(rows):
                                    pending.setdefault(target_row, {})[current_col] = cell

                            # The cell itself no longer spans rows
                            del cell['rowspan']

                        current_col += 1

                    # Insert left to right: after each insertion the row's cell list
                    # already contains the earlier copies, so `pos` is the final
                    # column index of the copy being placed.
                    for pos, cell_copy in cells_to_insert:
                        existing_cells = row.find_all(['td', 'th'])
                        if pos < len(existing_cells):
                            existing_cells[pos].insert_before(cell_copy)
                        else:
                            row.append(cell_copy)

            return str(soup)
        except Exception as e:
            # Best effort: never let table expansion break parsing
            print(f"⚠️ Rowspan expansion failed: {e}")
            import traceback
            traceback.print_exc()
            return html
    def _preclean_text(self, text: str) -> str:
        """
        Minimal pre-clean:
        - Keep the <:: ... ::> figure blocks (they can help LLM anchoring).
        - Remove empty anchor tags if present.
        """
        t = text
        t = re.sub(r'<a id=[\'"][^\'"]+[\'"]></a>\s*', '', t)
        
        # Expand rowspan cells if HTML detected
        if '<table' in t.lower():
            t = self.expand_rowspan_cells(t)
        
        return t.strip()

    def _light_sanitize(self, data: Dict[str, Any]) -> None:
        def strip_if_str(x):
            return x.strip() if isinstance(x, str) else x

        issue = data.get("issue_data", {})
        for k in ["issue_id", "section", "title", "country", "perforation"]:
            if k in issue:
                issue[k] = strip_if_str(issue[k])

        for s in data.get("stamps", []):
            s["color"] = strip_if_str(s.get("color", ""))
            if isinstance(s.get("notes"), list):
                s["notes"] = [strip_if_str(n) for n in s["notes"]]

        for v in data.get("varieties", []):
            v["description"] = strip_if_str(v.get("description", ""))

    def _create_chain(self):
        """
        Build the few-shot chain specialized for Mena.
        The assistant must always return a single JSON object matching the Mena schema.
        """

        # --------------------------
        # Few-shot: REGULAR + VARIETIES tables (two tables + constant plate varieties)
        # --------------------------
        ex_input_regular_var = """<table id="8-I">
        <tr><td id="8-J">Regular issue</td><td id="8-K"></td><td id="8-L"></td></tr>
        <tr><td id="8-M">1</td><td id="8-N">½ real blue (plate 1)</td><td id="8-O">3,000,000</td></tr>
        <tr><td id="8-P">1a</td><td id="8-Q">double perf horizontal</td><td id="8-R"></td></tr>
        <tr><td id="8-S">1b</td><td id="8-T">double perf diagonal</td><td id="8-U"></td></tr>
        <tr><td id="8-V">1c</td><td id="8-W">double impression at right</td><td id="8-X"></td></tr>
        <tr><td id="8-Y">1d</td><td id="8-Z">cracked plate (pos 1)</td><td id="8-10"></td></tr>
        <tr><td id="8-11">1e</td><td id="8-12">cracked plate (pos 11)</td><td id="8-13"></td></tr>
        <tr><td id="8-14">1f</td><td id="8-15">cracked plate (pos 21)</td><td id="8-16"></td></tr>
        </table>

        Constant plate varieties:
        g: period in center second star (pos 87)
        h: period in center third star (pos 89)
        I: line on top volcano (pos 96)

        <table><thead><tr><th>Col&nbsp;1</th><th>Col&nbsp;2</th></tr></thead><tbody>
        <tr><td>1A 1½ real light blue (plate 2)</td><td>2,750,000</td></tr>
        <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;1Aa imperf horizontal (pair or blocks-38)</td><td></td></tr>
        <tr><td>2 2 reales scarlet</td><td>750,000</td></tr>
        <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;2a engraver line through DOS (pos 1)</td><td></td></tr>
        <tr><td>3 4 reales green</td><td>70,000</td></tr>
        <tr><td>&nbsp;&nbsp;&nbsp;&nbsp;3a double entry of "Correos de Costa Rica" (pos 8)</td><td></td></tr>
        <tr><td>4 1 peso yellow</td><td>35,000</td></tr>
        </tbody></table>
        """

        ex_output_regular_var = json.dumps({
        "issue_data": {
            "issue_id": "CR-1863-FIRST-ISSUE",
            "section": "Surface Mail",
            "title": "Regular issue",
            "country": "Costa Rica",
            "issue_dates": {
            "announced": None,
            "placed_on_sale": None,
            "probable_first_circulation": None,
            "second_plate_sale": None,
            "demonetized": None
            },
            "legal_basis": [],
            "currency_context": {
            "original": "",
            "decimal_adoption": None,
            "revaluation_date": None,
            "revaluation_map": {}
            },
            "printing": {
            "printer": "",
            "process": [],
            "format": { "panes": None },
            "plates": {}
            },
            "perforation": ""
        },
        "production_orders": {
            "printings": [],
            "remainders": { "date": None, "note": "", "quantities": [] }
        },
        "stamps": [
            {
            "catalog_no": "1",
            "issue_id": "CR-1863-FIRST-ISSUE",
            "denomination": { "value": 0.5, "unit": "real" },
            "color": "blue",
            "plate": 1,
            "perforation": "",
            "watermark": None,
            "quantity_reported": 3000000,
            "status": "regular",
            "notes": []
            },
            {
            "catalog_no": "1A",
            "issue_id": "CR-1863-FIRST-ISSUE",
            "denomination": { "value": 1.5, "unit": "real" },
            "color": "light blue",
            "plate": 2,
            "perforation": "",
            "watermark": None,
            "quantity_reported": 2750000,
            "status": "regular",
            "notes": []
            },
            {
            "catalog_no": "2",
            "issue_id": "CR-1863-FIRST-ISSUE",
            "denomination": { "value": 2, "unit": "real" },
            "color": "scarlet",
            "plate": None,
            "perforation": "",
            "watermark": None,
            "quantity_reported": 750000,
            "status": "regular",
            "notes": []
            },
            {
            "catalog_no": "3",
            "issue_id": "CR-1863-FIRST-ISSUE",
            "denomination": { "value": 4, "unit": "real" },
            "color": "green",
            "plate": None,
            "perforation": "",
            "watermark": None,
            "quantity_reported": 70000,
            "status": "regular",
            "notes": []
            },
            {
            "catalog_no": "4",
            "issue_id": "CR-1863-FIRST-ISSUE",
            "denomination": { "value": 1, "unit": "P" },
            "color": "yellow",
            "plate": None,
            "perforation": "",
            "watermark": None,
            "quantity_reported": 35000,
            "status": "regular",
            "notes": []
            }
        ],
        "varieties": [
            { "base_catalog_no": "1", "suffix": "a", "type": "perforation", "description": "double perf horizontal", "position": None, "plate": None },
            { "base_catalog_no": "1", "suffix": "b", "type": "perforation", "description": "double perf diagonal", "position": None, "plate": None },
            { "base_catalog_no": "1", "suffix": "c", "type": "impression", "description": "double impression at right", "position": None, "plate": None },
            { "base_catalog_no": "1", "suffix": "d", "type": "plate_flaw", "description": "cracked plate", "position": "pos 1", "plate": 1 },
            { "base_catalog_no": "1", "suffix": "e", "type": "plate_flaw", "description": "cracked plate", "position": "pos 11", "plate": 1 },
            { "base_catalog_no": "1", "suffix": "f", "type": "plate_flaw", "description": "cracked plate", "position": "pos 21", "plate": 1 },

            { "base_catalog_no": "1", "suffix": "g", "type": "plate_flaw", "description": "period in center second star", "position": "pos 87", "plate": None },
            { "base_catalog_no": "1", "suffix": "h", "type": "plate_flaw", "description": "period in center third star", "position": "pos 89", "plate": None },
            { "base_catalog_no": "1", "suffix": "i", "type": "plate_flaw", "description": "line on top volcano", "position": "pos 96", "plate": None },

            { "base_catalog_no": "1A", "suffix": "a", "type": "perforation", "description": "imperf horizontal (pair or blocks-38)", "position": None, "plate": 2 },

            { "base_catalog_no": "2", "suffix": "a", "type": "plate_flaw", "description": "engraver line through DOS", "position": "pos 1", "plate": None },
            { "base_catalog_no": "3", "suffix": "a", "type": "plate_flaw", "description": "double entry of \"Correos de Costa Rica\"", "position": "pos 8", "plate": None }
        ],
        "proofs": { "die_proofs": [], "plate_proofs": [], "color_proofs": [], "imperforate_proofs": [] },
        "essays": [],
        "specimens": [],
        "postal_stationery" : []
        }, indent=2)

        
        
        # --------------------------
        # Few-shot: SURCHARGES block (fixed)
        # --------------------------
        ex_input_surcharges = """Surcharges 1881-82

        December 16, 1880. Accord 53 (2c), September or October 1882 (1c), December 1882 (5c).
        Surcharged by Imprenta Nacional in vermilion.
        Decimal currency was adopted in 1864: 100 centavos = 1 peso.
        Quantities unknown. Demonetized February 1, 1883. (Ref Ox 100, 1985, Ox 211, 2013).

        5
        1c on ½ real (plate 1) in straight letters

        5a
        surcharge on 1A (plate 2)

        6
        1c on ½ real (plate 1) in cursive letters

        7
        2c on ½ real (plate 1)

        7a
        surcharge on 1A (plate 2)

        7b
        double surcharge

        7c
        inverted op (one known in a block)

        7d
        Cts instead of cts (doubtful-Ox 114)

        8
        5c on ½ real 1A (plate 2) - never used

        8a
        double surcharge

        All varieties of base stamp #1 exist on stamp #5, 6 and 7.
        #5 and 6 exist se-tenant.
        Proofs in black of 5, 7 & 8 may exist.
        """

        ex_output_surcharges = json.dumps({
            "issue_data": {
                "issue_id": "CR-1881-82-SURCHARGES",
                "section": "Surface Mail",
                "title": "Surcharges 1881–82",
                "country": "Costa Rica",
                "issue_dates": {
                    "announced": None,
                    "placed_on_sale": None,
                    "probable_first_circulation": None,
                    "second_plate_sale": None,
                    "demonetized": "1883-02-01"
                },
                "legal_basis": [
                    {"type": "resolution", "id": "Accord 53", "date": "1880-12-16", "ids": [], "officials": []}
                ],
                "currency_context": {
                    "original": "c",
                    "decimal_adoption": "1864-01-01",
                    "revaluation_date": None,
                    "revaluation_map": {}
                },
                "printing": {
                    "printer": "Imprenta Nacional",
                    "process": ["surcharge"],
                    "format": {"panes": None},
                    "plates": {
                        "0.5_real": {
                            "plates": [1, 2],
                            "notes": ["plate 1 straight vs cursive fonts noted"]
                        }
                    }
                },
                "perforation": ""
            },
            "production_orders": {
                "printings": [],
                "remainders": {"date": None, "note": "", "quantities": []}
            },
            "stamps": [
                {
                    "catalog_no": "5",
                    "issue_id": "CR-1881-82-SURCHARGES",
                    "denomination": {"value": 1, "unit": "c"},
                    "color": "",
                    "plate": 1,
                    "perforation": "",
                    "watermark": None,
                    "quantity_reported": None,
                    "status": "regular",
                    "notes": [
                        "1c on 1/2 real (plate 1), straight letters",
                        "surcharge in vermilion"
                    ],
                    "overprint": {
                        "present": True,
                        "type": "surcharge",
                        "surcharge_denomination": {"value": 1, "unit": "c"},
                        "on_denomination": {"value": 0.5, "unit": "reales"}
                    },
                    "base_stamp_ref": "1"
                },
                {
                    "catalog_no": "6",
                    "issue_id": "CR-1881-82-SURCHARGES",
                    "denomination": {"value": 1, "unit": "c"},
                    "color": "",
                    "plate": 1,
                    "perforation": "",
                    "watermark": None,
                    "quantity_reported": None,
                    "status": "regular",
                    "notes": [
                        "1c on 1/2 real (plate 1), cursive letters",
                        "surcharge in vermilion"
                    ],
                    "overprint": {
                        "present": True,
                        "type": "surcharge",
                        "surcharge_denomination": {"value": 1, "unit": "c"},
                        "on_denomination": {"value": 0.5, "unit": "reales"}
                    },
                    "base_stamp_ref": "1"
                },
                {
                    "catalog_no": "7",
                    "issue_id": "CR-1881-82-SURCHARGES",
                    "denomination": {"value": 2, "unit": "c"},
                    "color": "",
                    "plate": 1,
                    "perforation": "",
                    "watermark": None,
                    "quantity_reported": None,
                    "status": "regular",
                    "notes": [
                        "2c on 1/2 real (plate 1)",
                        "surcharge in vermilion"
                    ],
                    "overprint": {
                        "present": True,
                        "type": "surcharge",
                        "surcharge_denomination": {"value": 2, "unit": "c"},
                        "on_denomination": {"value": 0.5, "unit": "reales"}
                    },
                    "base_stamp_ref": "1"
                },
                {
                    "catalog_no": "8",
                    "issue_id": "CR-1881-82-SURCHARGES",
                    "denomination": {"value": 5, "unit": "c"},
                    "color": "",
                    "plate": 2,
                    "perforation": "",
                    "watermark": None,
                    "quantity_reported": None,
                    "status": "regular",
                    "notes": [
                        "5c on 1/2 real 1A (plate 2) - never used",
                        "surcharge in vermilion"
                    ],
                    "overprint": {
                        "present": True,
                        "type": "surcharge",
                        "surcharge_denomination": {"value": 5, "unit": "c"},
                        "on_denomination": {"value": 0.5, "unit": "reales"}
                    },
                    "base_stamp_ref": "1A"
                }
            ],
            "varieties": [
                {
                    "base_catalog_no": "5",
                    "suffix": "a",
                    "type": "overprint",
                    "description": "surcharge on 1A (plate 2)",
                    "position": None,
                    "plate": 2
                },
                {
                    "base_catalog_no": "7",
                    "suffix": "a",
                    "type": "overprint",
                    "description": "surcharge on 1A (plate 2)",
                    "position": None,
                    "plate": 2
                },
                {
                    "base_catalog_no": "7",
                    "suffix": "b",
                    "type": "surcharge",
                    "description": "double surcharge",
                    "position": None,
                    "plate": None
                },
                {
                    "base_catalog_no": "7",
                    "suffix": "c",
                    "type": "overprint",
                    "description": "inverted overprint (one known in a block)",
                    "position": None,
                    "plate": None
                },
                {
                    "base_catalog_no": "7",
                    "suffix": "d",
                    "type": "overprint",
                    "description": "Cts instead of cts (doubtful - Ox 114)",
                    "position": None,
                    "plate": None
                },
                {
                    "base_catalog_no": "8",
                    "suffix": "a",
                    "type": "surcharge",
                    "description": "double surcharge",
                    "position": None,
                    "plate": None
                }
            ],
            "proofs": {
                "die_proofs": [],
                "plate_proofs": [],
                "color_proofs": [
                    {
                        "code": "",
                        "denomination": "No. 5",
                        "color": "black",
                        "notes": "Proof in black reportedly may exist (unconfirmed)"
                    },
                    {
                        "code": "",
                        "denomination": "No. 7",
                        "color": "black",
                        "notes": "Proof in black reportedly may exist (unconfirmed)"
                    },
                    {
                        "code": "",
                        "denomination": "No. 8",
                        "color": "black",
                        "notes": "Proof in black reportedly may exist (unconfirmed)"
                    }
                ],
                "imperforate_proofs": []
            },
            "essays": [],
            "specimens": [],
            "postal_stationery" : []
        }, indent=2)

        # ----------------------------------------
        # Few-shot: PRODUCTION ORDERS / PLATE DATA
        # ----------------------------------------
        ex_input_prod = """First Issue
April 11, 1863. Decree #2 of August 18, 1862. Engraved and recess printed by ABNCo. in panes of 100. Perf 12.
The half real stamp printed from two plates since the original one developed a crack.
Orders:
October 11, 1862 — 250,000 (1/2 real plate 1), 250,000 (2 reales)
September 30, 1863 — 20,000 (4 reales), 10,000 (1 peso)
September 1865 — 500,000 (1/2 real plate 1), 500,000 (2 reales), 50,000 (4 reales), 25,000 (1 peso)
December 24, 1872 — 2,000,000 (1/2 real plate 1)
June 18, 1875 — 250,000 (1/2 real plate 1), 2,750,000 (1/2 real plate 2)
Remainders sold May 23, 1883: 1,000,000 (1/2 real plate 1), 1,615,000 (1/2 real plate 2), 385,000 (2 reales), 23,000 (4 reales), 10,500 (1 peso)
"""
        ex_output_prod = json.dumps({
            "issue_data": {
                "issue_id": "CR-1863-FIRST-ISSUE",
                "section": "Surface Mail",
                "title": "First Issue",
                "country": "Costa Rica",
                "issue_dates": {
                    "announced": None,
                    "placed_on_sale": "1863-12-01",
                    "probable_first_circulation": "1863-04-11",
                    "second_plate_sale": "1875-09-01",
                    "demonetized": "1883-01-31"
                },
                "legal_basis": [
                    {"type": "decree", "id": "Decree #2", "date": "1862-08-18", "ids": [], "officials": []}
                ],
                "currency_context": {
                    "original": "real/peso",
                    "decimal_adoption": "1864-01-01",
                    "revaluation_date": "1866-06-09",
                    "revaluation_map": {"2 real": "5c", "2 reales": "25c", "4 reales": "50c"}
                },
                "printing": {
                    "printer": "ABNCo.",
                    "process": ["engraved", "recess printed"],
                    "format": {"panes": 100},
                    "plates": {"0.5_real": {"plates": [1, 2], "notes": ["plate 1 cracked"]}}
                },
                "perforation": "12"
            },
            "production_orders": {
                "printings": [
                    {"date": "1862-10-11", "quantities": [
                        {"plate_desc": "0.5_real_plate1", "quantity": 250000},
                        {"plate_desc": "2_reales", "quantity": 250000}
                    ]},
                    {"date": "1863-09-30", "quantities": [
                        {"plate_desc": "4_reales", "quantity": 20000},
                        {"plate_desc": "1_peso", "quantity": 10000}
                    ]},
                    {"date": "1865-09-01", "quantities": [
                        {"plate_desc": "0.5_real_plate1", "quantity": 500000},
                        {"plate_desc": "2_reales", "quantity": 500000},
                        {"plate_desc": "4_reales", "quantity": 50000},
                        {"plate_desc": "1_peso", "quantity": 25000}
                    ]},
                    {"date": "1872-12-24", "quantities": [
                        {"plate_desc": "0.5_real_plate1", "quantity": 2000000}
                    ]},
                    {"date": "1875-06-18", "quantities": [
                        {"plate_desc": "0.5_real_plate1", "quantity": 250000},
                        {"plate_desc": "0.5_real_plate2", "quantity": 2750000}
                    ]}
                ],
                "remainders": {
                    "date": "1883-05-23",
                    "note": "remainder sale",
                    "quantities": [
                        {"plate_desc": "0.5_real_plate1", "quantity": 1000000},
                        {"plate_desc": "0.5_real_plate2", "quantity": 1615000},
                        {"plate_desc": "2_reales", "quantity": 385000},
                        {"plate_desc": "4_reales", "quantity": 23000},
                        {"plate_desc": "1_peso", "quantity": 10500}
                    ]
                }
            },
            "stamps": [],
            "varieties": [],
            "proofs": {"die_proofs": [], "plate_proofs": [], "color_proofs": [], "imperforate_proofs": []},
            "essays": [],
            "specimens": [],
            "postal_stationery" : []
        }, indent=2)

        # ----------------------------------------
        # Few-shot: PROOFS (DP/PP) block
        # ----------------------------------------
        ex_input_proofs = """Die Proofs on India paper, imperf or sunk on card

DP1: 1/2 real black die #332
DP2: 2 reales black die #330
  DP2a: scarlet
DP3: 4 reales black die #387
DP4: 1 peso black die #388
  DP4a: green
  DP4b: brown
  DP4c: reddish brown

Plate PP1 — Proofs in India paper, imperf or on card
1/2 real blue (plate 1)
PP1a PP1b PP1c PP1d — black on backer; yellow; green; orange

PP1A — 1/2 real light blue (plate 2)

PP2 — 2 reales scarlet
PP2a — reddish purple
PP2c — yellow
PP2d — green
"""
        ex_output_proofs = json.dumps({
            "issue_data": {
                "issue_id": "CR-1863-PROOFS-SEGMENT",
                "section": "Surface Mail",
                "title": "First Issue Proofs",
                "country": "Costa Rica",
                "issue_dates": {
                    "announced": None,
                    "placed_on_sale": None,
                    "probable_first_circulation": None,
                    "second_plate_sale": None,
                    "demonetized": None
                },
                "legal_basis": [],
                "currency_context": {
                    "original": "",
                    "decimal_adoption": None,
                    "revaluation_date": None,
                    "revaluation_map": {}
                },
                "printing": {"printer": "", "process": [], "format": {"panes": None}, "plates": {}},
                "perforation": ""
            },
            "production_orders": {"printings": [], "remainders": {"date": None, "note": "", "quantities": []}},
            "stamps": [],
            "varieties": [],
            "proofs": {
                "die_proofs": [
                    {"code": "DP1", "denomination": "1/2 real", "color": "black", "die_no": "332",
                     "substrate": "India paper", "finish": "imperf or sunk on card"},
                    {"code": "DP2", "denomination": "2 reales", "color": "black", "die_no": "330",
                     "substrate": "India paper", "finish": "imperf or sunk on card"},
                    {"code": "DP2a", "denomination": "2 reales", "color": "scarlet", "die_no": "330",
                     "substrate": "India paper", "finish": "imperf or sunk on card"},
                    {"code": "DP3", "denomination": "4 reales", "color": "black", "die_no": "387",
                     "substrate": "India paper", "finish": "imperf or sunk on card"},
                    {"code": "DP4", "denomination": "1 peso", "color": "black", "die_no": "388",
                     "substrate": "India paper", "finish": "imperf or sunk on card"},
                    {"code": "DP4a", "denomination": "1 peso", "color": "green", "die_no": "388",
                     "substrate": "India paper", "finish": "imperf or sunk on card"},
                    {"code": "DP4b", "denomination": "1 peso", "color": "brown", "die_no": "388",
                     "substrate": "India paper", "finish": "imperf or sunk on card"},
                    {"code": "DP4c", "denomination": "1 peso", "color": "reddish brown", "die_no": "388",
                     "substrate": "India paper", "finish": "imperf or sunk on card"}
                ],
                "plate_proofs": [
                    {"code": "PP1", "note": "India paper; imperf or on card",
                     "items": [
                         {"variant": "PP1", "denomination": "1/2 real", "color": "blue", "plate": 1, "note": ""},
                         {"variant": "PP1a", "denomination": "", "color": "", "plate": None, "note": "black on backer"},
                         {"variant": "PP1b", "denomination": "", "color": "yellow", "plate": None, "note": ""},
                         {"variant": "PP1c", "denomination": "", "color": "green", "plate": None, "note": ""},
                         {"variant": "PP1d", "denomination": "", "color": "orange", "plate": None, "note": ""}
                     ]},
                    {"code": "PP1A", "note": "",
                     "items": [{"variant": "PP1A", "denomination": "1/2 real", "color": "light blue", "plate": 2, "note": ""}]},
                    {"code": "PP2", "note": "",
                     "items": [
                         {"variant": "PP2", "denomination": "2 reales", "color": "scarlet", "plate": None, "note": ""},
                         {"variant": "PP2a", "denomination": "", "color": "reddish purple", "plate": None, "note": ""},
                         {"variant": "PP2c", "denomination": "", "color": "yellow", "plate": None, "note": ""},
                         {"variant": "PP2d", "denomination": "", "color": "green", "plate": None, "note": ""}
                     ]}
                ],
                "color_proofs": [],
                "imperforate_proofs": []
            },
            "essays": [],
            "specimens": [],
            "postal_stationery" : []
        }, indent=2)
        
        # --------------------------
        # Few-shot: SPECIMENS block (Costa Rica MUESTRA overprints)
        # --------------------------
        ex_input_specimens = """Specimens

        Overprint "MUESTRA" in black or red:

        MA46a bk 10c scarlet
        MA47a red 15c purple
        MA48a red 25c light blue
        MA49a red 35c bistre brown
        MA50a red 60c bluish green
        MA51a red 75c olive
        MA52a bk 1.35C red orange
        MA53a red 5C sepia
        MA54a bk 10C red lilac

        Overprint "muestra" oblique in red:

        MA180 15c blue
        MA181 20c red
        MA182 35c dark green
        MA183 45c purple
        MA184 50c carmine
        MA185 75c red violet
        MA186 1C olive
        MA187 2C red brown
        MA188 5C orange yellow
        MA189 10C bright blue
        """

        ex_output_specimens = json.dumps({
            "issue_data": {
                "issue_id": "CR-SPECIMENS-SEGMENT",
                "section": "Surface Mail",
                "title": "Specimens",
                "country": "Costa Rica",
                "issue_dates": {
                    "announced": None,
                    "placed_on_sale": None,
                    "probable_first_circulation": None,
                    "second_plate_sale": None,
                    "demonetized": None
                },
                "legal_basis": [],
                "currency_context": {
                    "original": "",
                    "decimal_adoption": None,
                    "revaluation_date": None,
                    "revaluation_map": {}
                },
                "printing": {
                    "printer": "",
                    "process": [],
                    "format": {"panes": None},
                    "plates": {}
                },
                "perforation": ""
            },
            "production_orders": {
                "printings": [],
                "remainders": {"date": None, "note": "", "quantities": []}
            },
            "stamps": [],
            "varieties": [],
            "proofs": {
                "die_proofs": [],
                "plate_proofs": [],
                "color_proofs": [],
                "imperforate_proofs": []
            },
            "essays": [],
            "specimens": [
                {
                    "code": "MA46a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "10c",
                    "base_color": "scarlet",
                    "overprint_color": "black",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA47a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "15c",
                    "base_color": "purple",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA48a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "25c",
                    "base_color": "light blue",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA49a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "35c",
                    "base_color": "bistre brown",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA50a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "60c",
                    "base_color": "bluish green",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA51a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "75c",
                    "base_color": "olive",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA52a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "1.35C",
                    "base_color": "red orange",
                    "overprint_color": "black",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA53a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "5C",
                    "base_color": "sepia",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA54a",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "10C",
                    "base_color": "red lilac",
                    "overprint_color": "black",
                    "notes": "MUESTRA overprint"
                },
                {
                    "code": "MA180",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "15c",
                    "base_color": "blue",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA181",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "20c",
                    "base_color": "red",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA182",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "35c",
                    "base_color": "dark green",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA183",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "45c",
                    "base_color": "purple",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA184",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "50c",
                    "base_color": "carmine",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA185",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "75c",
                    "base_color": "red violet",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA186",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "1C",
                    "base_color": "olive",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA187",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "2C",
                    "base_color": "red brown",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA188",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "5C",
                    "base_color": "orange yellow",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                },
                {
                    "code": "MA189",
                    "applies_to": "stamps",
                    "type": "overprint",
                    "denomination": "10C",
                    "base_color": "bright blue",
                    "overprint_color": "red",
                    "notes": "MUESTRA overprint oblique"
                }
            ],
            "postal_stationery" : []
        }, indent=2)
        # --------------------------
        # Few-shot: POSTAL STATIONERY (all types)
        # --------------------------
        ex_input_postal_stationery = """Postal Stationery

        Die Proofs
        DPPC1  2c black
        DPEN1  5c black

        Overprint "muestra"
        MPC1   2c black

        Regular issue - Postal Cards
        PC1    2c black                        50,000
        PC2    4c black, with reply card       50,000

        Envelopes
        EN1    5c blue (thin paper)           180,000
        EN1a   pale blue
        EN2    10c orange yellow              75,000

        Aerogrammes
        LS1    2C multicolor                  100,000

        Official Envelopes
        OEN1   2c green, op "Servicio Oficial"
        OEN1a  inverted op

        Wrappers
        W1     2c green
        """

        ex_output_postal_stationery = json.dumps({
            "issue_data": {
                "issue_id": "CR-POSTAL-STATIONERY-SEGMENT",
                "section": "Postal Stationery",
                "title": "Postal Stationery",
                "country": "Costa Rica",
                "issue_dates": {"announced": None, "placed_on_sale": None, "probable_first_circulation": None, "second_plate_sale": None, "demonetized": None},
                "legal_basis": [],
                "currency_context": {"original": "", "decimal_adoption": None, "revaluation_date": None, "revaluation_map": {}},
                "printing": {"printer": "", "process": [], "format": {"panes": None}, "plates": {}},
                "perforation": ""
            },
            "production_orders": {"printings": [], "remainders": {"date": None, "note": "", "quantities": []}},
            "stamps": [],
            "varieties": [
                {"base_catalog_no": "EN1", "suffix": "a", "type": "color", "description": "pale blue", "position": None, "plate": None},
                {"base_catalog_no": "OEN1", "suffix": "a", "type": "overprint", "description": "inverted overprint", "position": None, "plate": None}
            ],
            "proofs": {
                "die_proofs": [
                    {"code": "DPPC1", "denomination": "2c", "color": "black", "die_no": "", "substrate": "", "finish": ""},
                    {"code": "DPEN1", "denomination": "5c", "color": "black", "die_no": "", "substrate": "", "finish": ""}
                ],
                "plate_proofs": [],
                "color_proofs": [],
                "imperforate_proofs": []
            },
            "essays": [],
            "specimens": [
                {"code": "MPC1", "applies_to": "postal_stationery", "type": "overprint", "denomination": "2c", "base_color": "black", "overprint_color": "black", "notes": "MUESTRA overprint"}
            ],
            "postal_stationery": [
                {"catalog_no": "PC1", "stationery_type": "postal_card", "denomination": {"value": 2, "unit": "c"}, "color": "black", "paper": "", "size": "", "quantity_reported": 50000, "card_type": "single", "notes": ""},
                {"catalog_no": "PC2", "stationery_type": "postal_card", "denomination": {"value": 4, "unit": "c"}, "color": "black", "paper": "", "size": "", "quantity_reported": 50000, "card_type": "reply", "notes": "with reply card"},
                {"catalog_no": "EN1", "stationery_type": "envelope", "denomination": {"value": 5, "unit": "c"}, "color": "blue", "paper": "thin paper", "size": "", "quantity_reported": 180000, "notes": ""},
                {"catalog_no": "EN2", "stationery_type": "envelope", "denomination": {"value": 10, "unit": "c"}, "color": "orange yellow", "paper": "", "size": "", "quantity_reported": 75000, "notes": ""},
                {"catalog_no": "LS1", "stationery_type": "aerogramme", "denomination": {"value": 2, "unit": "C"}, "color": "multicolor", "paper": "", "size": "", "quantity_reported": 100000, "notes": ""},
                {"catalog_no": "OEN1", "stationery_type": "official_envelope", "denomination": {"value": 2, "unit": "c"}, "color": "green", "paper": "", "size": "", "quantity_reported": None, "notes": "", "overprint": {"present": True, "type": "overprint", "text": "Servicio Oficial", "color": "black"}},
                {"catalog_no": "W1", "stationery_type": "wrapper", "denomination": {"value": 2, "unit": "c"}, "color": "green", "paper": "", "size": "", "quantity_reported": None, "notes": ""}
            ]
        }, indent=2)
        # Few-shot wrapper
        example_prompt = ChatPromptTemplate.from_messages([
            ("human", "{input}"),
            ("ai", "{output}")
        ])

        few_shot = FewShotChatMessagePromptTemplate(
            example_prompt=example_prompt,
            examples=[
                {"input": ex_input_surcharges, "output": ex_output_surcharges},
                {"input": ex_input_prod,       "output": ex_output_prod},
                {"input": ex_input_proofs,     "output": ex_output_proofs},                
                {"input": ex_input_regular_var, "output": ex_output_regular_var},
                {"input": ex_input_specimens,   "output": ex_output_specimens},
                {"input": ex_input_postal_stationery,   "output": ex_output_postal_stationery}
            ],
        )

        # System prompt — ALL BRACES ARE ESCAPED {{like this}} to avoid LangChain templating errors.
        system_prompt = """
You are a structured parser for the Mena Catalog (Costa Rica). Extract data into ONE JSON object
with EXACTLY these top-level keys:
- "issue_data"
- "production_orders"
- "stamps"
- "varieties"
- "proofs"
- "essays"
- "specimens"
- "postal_stationery"

Return the FULL schema even if empty. If a section doesn't appear, return it as empty containers:
- lists as []
- maps as {{}}
- dates as null
- strings as ""

Output schema (types and intent):

issue_data:
  - issue_id (string): Stable unique ID (recommend COUNTRY-YYYY[-YYYY]-TITLE).
  - section (string): Catalog section (e.g., "Surface Mail").
  - title (string): Issue title as printed.
  - country (string): Country name.
  - issue_dates (object):
      announced | placed_on_sale | probable_first_circulation | second_plate_sale | demonetized (ISO or null)
  - legal_basis (array of objects):
      {{ type: "decree"|"law"|"letter"|"resolution"|..., id: string, date: ISO|null, ids: [], officials: [] }}
  - currency_context (object):
      original (string), decimal_adoption (ISO|null), revaluation_date (ISO|null), revaluation_map (object string->string)
  - printing (object):
      printer (string), process (string[]), format {{ panes: number|null }}, plates {{ "<denom_token>": {{ plates: number[], notes: string[] }} }}
  - perforation (string): numeric gauge like "12" if specified, else "".

production_orders:
  - printings (array):
      {{ date: ISO|null, quantities: [ {{ plate_desc: string, quantity: number }} ] }}
  - remainders (object):
      {{ date: ISO|null, note: string, quantities: [ {{ plate_desc: string, quantity: number }} ] }}

stamps (regular issues + souvenir sheets):
  - array of objects with these fields:
    
    FOR REGULAR STAMPS:
    {{
      catalog_no: string,                       // e.g., "54", "83", "1A"
      issue_id: string,                         // link to issue_data.issue_id
      denomination: {{                          // resulting face value of the issued stamp
        value: number,
        unit: "c" | "C" | "P" | "reales"        // normalize: centavo/cts/centime -> "c"; Colón -> "C"; Peso -> "P"; real/reales -> "reales"
      }},
      color: string,                            // "" if not stated
      plate: number | null,                     // null if not given
      perforation: string | "",                 // gauge only (e.g., "12"); "" if unknown (do NOT include "perf")
      watermark: string | null,                 // null if not mentioned
      quantity_reported: number | null,         // ONLY if the specific catalog number line states a quantity; else null
      status: "regular",                        // ALWAYS "regular" for regular stamps
      notes: string[],                          // [] if none

      // ONLY when the REGULAR stamp line itself is a surcharge/overprint listing (e.g., "2c/10C"):
      overprint?: {{
        present: boolean,
        type: "surcharge" | "overprint" | "bar_cancel" | "other",
        surcharge_denomination?: {{ value: number, unit: "c" | "C" | "P" | "reales" }},
        on_denomination?: {{ value: number, unit: "c" | "C" | "P" | "reales" }},
        color?: string
      }},
      base_stamp_ref?: string                   // optional cross-ref (e.g., "1", "1A", "091")
    }}
    
    FOR SOUVENIR SHEETS (SS codes):
    {{
      catalog_no: string,                       // e.g., "SSA752", "SS123"
      issue_id: string,                         // link to issue_data.issue_id
      denomination: {{
        value: null,                            // ALWAYS null for souvenir sheets
        unit: "sheet"                           // ALWAYS "sheet" for souvenir sheets
      }},
      color: string,                            // often "multicolor"
      plate: number | null,                     // usually null
      perforation: string | "",                 // gauge if perforated (e.g., "10.5"), "" if imperf
      watermark: string | null,                 // usually null
      quantity_reported: number | null,         // from table if available
      status: "souvenir_sheet",                 // ALWAYS "souvenir_sheet" for SS codes
      notes: string[],                          // describe contents: ["Souvenir sheet with 5 values perf and island map"]
      
      // OPTIONAL: list of stamps contained in the sheet
      sheet_contents?: string[]                 // e.g., ["A747", "A748", "A749", "A750", "A751"]
                                                // Only populate if text explicitly lists which stamps
                                                // Otherwise omit field or use []
    }}

varieties:
  - array of:
    {{
      base_catalog_no: string,                 // base main number (e.g., "31", "1A")
      suffix: string,                          // LOWERCASE suffix only (e.g., "a", "b")
      type: "perforation"|"impression"|"plate_flaw"|"overprint"|"surcharge"|"color"|"color_shift"|"watermark"|"paper"|"gumming"|"other",
      description: string,
      position: string|null,
      plate: number|null
    }}

proofs:
  - die_proofs: [
      {{ code: string, denomination: string, color: string, die_no: string, substrate: string, finish: string }}
    ]
  - plate_proofs: [
      {{ code: string, note: string, items: [ {{ variant: string, denomination: string, color: string, plate: number|null, note: string }} ] }}
    ]
  - color_proofs: [ {{ code: string, denomination: string, color: string, notes: string }} ]
  - imperforate_proofs: [ {{ code: string, denomination: string, notes: string }} ]

essays:
  - [ {{ code: string, medium: string, paper: string, denomination: string, provenance: string[], notes: string[] }} ]

specimens:
  - [ {{ code: string, applies_to: "proofs"|"stamps"|"postal_stationery", type: "overprint"|"punch"|"perfin"|"handstamp"|string, denomination: string, base_color: string, overprint_color: string, notes: string }} ]
postal_stationery: [
  {{
    catalog_no: string,                    // e.g., "PC1", "EN5", "LS1", "OEN2", "W1"
    issue_id: string,
    stationery_type: "postal_card" | "envelope" | "aerogramme" | "official_envelope" | "wrapper",
    denomination: {{
      value: number,
      unit: "c" | "C" | "P"
    }},
    color: string,                         // printed color
    paper: string,                         // e.g., "buff manila", "white laid", "thick paper"
    size: string,                          // dimensions (e.g., "132 x 80 mm", "138 x 80 mm")
    quantity_reported: number|null,
    notes: string[],
    
    // OPTIONAL fields (only when applicable):
    card_type?: "single" | "reply" | "double",  // only for postal_card type
    overprint?: {{                         // for official envelopes with overprints
      present: boolean,
      type: "overprint" | "surcharge",
      text: string,                        // e.g., "Servicio Oficial", "Libre de Porte"
      color: string
    }},
    base_ref?: string                      // if overprinted on another stationery item
  }}
]

CRITICAL: Only populate postal_stationery if the input text explicitly
mentions postal cards (PC), envelopes (EN), official envelopes (OEN),
aerogrammes (LS), wrappers (W), or letter sheets.

If there is NO mention of postal stationery in the input, return:
"postal_stationery": []

NEVER invent postal stationery data.

--------------------------------------------------------------------------------
HARD SEPARATION: SPECIMENS vs VARIETIES (NEVER MIX)
--------------------------------------------------------------------------------
A) SPECIMENS (S-codes)
- Detector: any line that BEGINS with uppercase "S" + digits + optional lowercase (regex: ^S\\d+[a-z]?$).
- Each S-code MUST yield ONE item in top-level "specimens".
- NEVER place S-codes under "varieties", "proofs", or inside "stamps".
- "applies_to": default "stamps". Use "proofs" ONLY if the text explicitly binds that S-code to proofs.
- "type": default "overprint" unless the line explicitly indicates "punch", "perfin", "handstamp", etc.
- 'Muestra' is a special specimen type that is always an overprint; its code starts with "M" + the base code (usually "MA…", but also e.g. "MNEA…", "MPC…", "MEN…"). Example: "MA46a bk 10c scarlet" → code: MA46a, overprint_color: black, denomination: 10c, base_color: scarlet
- "denomination": take the face value and unit on the same row if present; if not, inherit from the row header ONLY if unambiguous; otherwise "".
- "base_color": color BEFORE the overprint phrase, lowercased.
- "overprint_color": keep orientation adjectives (e.g., "black", "red", "in a black circle").
- "notes": carry remaining qualifiers (e.g., "imperforate; ungummed thin paper with hole"; "Perf 12.5"; "salesman samples").
- IMPORTANT: Even if an S-code mentions “inverted overprint” or “Perf 12.5”, it REMAINS a specimen and stays in "specimens" (not in "varieties").

B) VARIETIES (lowercase suffixes of MAIN numbers)
- Detector: main catalog number + lowercase suffix (e.g., "31a", "33b", "1Aa").
- Keep ONLY these in "varieties".
- Uppercase letter immediately after digits forms a MAIN number (e.g., "1A"): keep in "stamps", not "varieties".
- Typical entries: imperf between, margin imperf, inverted overprint on a REGULAR (non-S-code) stamp, paper, watermark, color shade, plate flaw.
- **VARIETY BASE NUMBER**:
  Always use catalog number from Column 1 as base_catalog_no, even if variety code in Column 2 
  contains a different number. Example: <tr><td>34</td><td>33d</td></tr> → base_catalog_no: "34", suffix: "d"
Collision resolution:
- If a feature appears both in a REGULAR listing and in an S-code list:
  • Regular listing feature → "varieties".
  • S-code feature → keep inside that S-code item in "specimens" (as type/notes). Do NOT mirror it into "varieties".

--------------------------------------------------------------------------------
PROGRESSIVE DIES & SPECIAL CASES
--------------------------------------------------------------------------------
- Progressive die (e.g., “DPA31a vignette only”): put under "proofs.die_proofs" as:
  {{ code: "DPA31a", denomination: "", color: "", die_no: "", substrate: "", finish: "progressive: vignette only" }}
- Bar cancels: only use stamps[].overprint.type = "bar_cancel" when the main regular line defines it as such.
- Do NOT convert progressive dies or salesman sheets into "varieties".

--------------------------------------------------------------------------------
NON-ISSUED STAMPS (NEA CODES)
--------------------------------------------------------------------------------
When a table shows specimen categories including "without overprint" or 
"non-emis", these are non-issued stamps that should be captured in specimens.

DETECTION:
- Code pattern: NEA\\d+[a-z]?
- Table columns showing: I. Without overprint, II. specimen, III. muestra

FIELD ASSIGNMENT:
- code: as written (NEA46, NEA47, etc.)
- applies_to: "stamps"
- type: "non-issued" or "unissued"
- overprint_color: "" (no overprint)
- notes: "non-issued stamp without overprint" or similar

These are distinct from regular specimens (SNEA, MNEA, MA codes).

--------------------------------------------------------------------------------
CONSTANT VARIETIES - EXPANSION RULE
--------------------------------------------------------------------------------
When text contains "Constant [X] varieties in [Y]:" followed by variety list,
these varieties apply to ALL items in the specified range, not just one.

DETECTION PATTERNS:
1. "Constant overprint plate varieties in regular issue"
   → Apply to all regular stamps in the issue
   
2. "Constant varieties in regular issue and 'muestra' overprints"
   → Apply to all regular stamps AND note in muestra specimens
   
3. "Constant plate varieties"
   → Apply to all stamps using that plate

IMPLEMENTATION:
When parsing varieties section, if header says "Constant ... varieties in [scope]":

Step 1: Parse the variety definitions:
  a: description (pos X)
  b: description (pos Y)
  
Step 2: Identify the applicable range:
  - If "in regular issue": apply to ALL stamps with status="regular"
  - If "in stamps A46-A52": apply to that specific range
  - If no range specified: apply to all stamps in current issue
  
Step 3: For EACH stamp in range, create variety entry:
  {{
    "base_catalog_no": "<stamp_catalog_no>",
    "suffix": "<variety_letter>",
    "type": "<variety_type>",
    "description": "<variety_description>",
    "position": "<pos if specified>",
    "plate": <plate_number if known>
  }}

EXAMPLE FROM CURRENT TEST:
Input:
  Regular stamps: A46, A47, A48, A49, A50, A51, A52, A53, A54
  "Constant overprint plate varieties in regular issue:
   a: hyphen missing between 2 and Diciembre (pos 13)
   b: DJA for DIA (pos 49)"

Output varieties[] should contain 18 entries:
  - A46a, A46b
  - A47a, A47b
  - A48a, A48b
  - A49a, A49b
  - A50a, A50b
  - A51a, A51b
  - A52a, A52b
  - A53a, A53b
  - A54a, A54b

All with the same descriptions but different base_catalog_no.

--------------------------------------------------------------------------------
OVERPRINT PROOFS (OP codes)
--------------------------------------------------------------------------------
Codes starting with "OP" (e.g., OPA46, OPB23) are OVERPRINT PROOFS.
These should be placed in "proofs.plate_proofs", NOT in "imperforate_proofs".

--------------------------------------------------------------------------------
UNIT NORMALIZATION
--------------------------------------------------------------------------------
- UNITS normalization:
  • "c", "cent", "centavo", "centavos", "cts", "centime" -> "c"
  • Capital "C" = "Colón", MUST remain "C"
  • "P" = "Peso"; "real"/"reales" -> "reales"


Pattern recognition:
- "1.35C" → unit: "C" (Colón, not centavo)
- "5C" → unit: "C" (Colón)
- "10C" → unit: "C" (Colón)
- "10c" → unit: "c" (centavo | centimo | cts)
- "75c" → unit: "c" (centavo | centimo | cts)

MUST preserve the case of the unit letter from the source.
Never convert "C" → "c" or vice versa.

--------------------------------------------------------------------------------
DENOMINATIONS & SURCHARGES ("/" RULE)
--------------------------------------------------------------------------------
- "2c/10C" means a surcharge: the result denomination is 2c; base was 10C.
- Populate:
  "denomination": {{ "value": 2, "unit": "c" }}
  "overprint": {{
    "present": true,
    "type": "surcharge",
    "surcharge_denomination": {{ "value": 2, "unit": "c" }},
    "on_denomination": {{ "value": 10, "unit": "C" }}
  }}
  
--------------------------------------------------------------------------------
DENOMINATION EXTRACTION - NO AUTO-CONVERSION
--------------------------------------------------------------------------------
CRITICAL: Extract denomination EXACTLY as written in the source.

NEVER convert between units automatically:
- If source says "90c" → value: 90, unit: "c" (NOT 0.9C)
- If source says "0.9C" → value: 0.9, unit: "C" (NOT 90c)
- If source says "2.10C" → value: 2.1, unit: "C"

Even though 100 centavos = 1 Colón in Costa Rica, do NOT perform conversions.
The denomination must reflect what is PRINTED on the stamp, not mathematical equivalents.


--------------------------------------------------------------------------------
SURCHARGE STAMPS - COLOR FIELD EXTRACTION
--------------------------------------------------------------------------------
For stamps with surcharge format: "Xc on Yc COLOR, in COLOR2"

Example: "10c on 15c green, in black"
         "35c on 50c violet, in orange"

PARSING RULES:
1. denomination = Xc (the NEW value after surcharge)
2. color = COLOR (ONLY the color of the BASE stamp)
3. overprint.surcharge_denomination = Xc
4. overprint.on_denomination = Yc
5. overprint.color = COLOR2 (from "in COLOR2")

CRITICAL: The "color" field must contain ONLY the base stamp color.
Do NOT include the entire phrase "on Yc COLOR, in COLOR2".

--------------------------------------------------------------------------------
SOUVENIR SHEETS
--------------------------------------------------------------------------------
Souvenir sheets (SS codes like SSA752, SS123) are special collectible formats.

DETECTION: Codes starting with "SS" followed by optional letter and numbers

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "SSA752",
  "denomination": {{"value": null, "unit": "sheet"}},
  "status": "souvenir_sheet",
  "color": "multicolor",
  "perforation": "10.5",
  "quantity_reported": 40000,
  "notes": ["Souvenir sheet with 5 values perf and island map"],
  "sheet_contents": []  // optional: ["A747", "A748"] if explicitly stated
}}

VARIETIES: If SSA752a is imperforated version → place in varieties[], not stamps[]:
{{
  "base_catalog_no": "SSA752",
  "suffix": "a",
  "type": "perforation",
  "description": "imperforated"
}}

EXAMPLE:
Input:
  SSA752    sheet with 5 values perf    40,000
  SSA752a   sheet imperforated

Output:
- stamps[]: SSA752 with status: "souvenir_sheet", unit: "sheet", value: null
- varieties[]: SSA752a as perforation variety

--------------------------------------------------------------------------------
SPECIAL PREFIX CODES - IA, IB, IC (Imperforate Variants)
--------------------------------------------------------------------------------
Some catalog systems use prefix codes for special variants:

DETECTION:
Codes starting with "I" followed by regular code: IA722, IB123, IC45

MEANING:
- IA722 = Imperforate version of A722
- Similar to SSA (Souvenir Sheet A), IA (Imperforate A)

CLASSIFICATION:
These are COMPLETE catalog codes, NOT suffixes.

IA722 is NOT the same as A722i or A722a

PLACE AS SEPARATE STAMP:
{{
  "catalog_no": "IA722",
  "denomination": {{...}},
  "perforation": "",  // always imperf for IA codes
  "status": "error" | "printer_waste" | "regular",  // based on context
  "notes": ["Imperforate (printer's waste)"]  // or similar
}}

--------------------------------------------------------------------------------
ATM STAMPS (VARIABLE VALUE STAMPS)
--------------------------------------------------------------------------------
ATM stamps (from German "Automatenmarken" = vending-machine stamps; unrelated
to bank Automated Teller Machines) are variable value stamps printed by
machines. They have no fixed denomination.

DETECTION: Codes starting with "ATM" followed by numbers: ATM9, ATM12, ATM5

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "ATM9",
  "denomination": {{
    "value": null,           // no fixed value
    "unit": "variable"       // special unit for ATM stamps
  }},
  "status": "atm",           // special status
  "color": string,           // if mentioned
  "perforation": "",         // typically imperf or machine cut
  "quantity_reported": null, // variable, not typically reported
  "notes": string[]          // include description, e.g., ["Papagayo Gulf", "Variable value stamp", "Size 57 x 27 mm"]
}}

NOTES FIELD:
Include relevant details:
- Subject/design description
- Size dimensions (e.g., "Size 57 x 27 mm")
- "Variable value stamp" or "ATM stamp"
- Any special characteristics

--------------------------------------------------------------------------------
CHRISTMAS POSTAL TAX STAMPS
--------------------------------------------------------------------------------
Christmas Tax stamps (CT codes) are special postal tax stamps.

DETECTION: Codes starting with "CT" followed by numbers/letters: CT1, CT1A, CT25

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "CT1",
  "denomination": {{
    "value": number,              // the surcharge/final value (e.g., 5)
    "unit": "c" | "C"             // standard units
  }},
  "status": "postal_tax",         // special status for tax stamps
  "color": string,
  "perforation": string,
  "quantity_reported": number|null,
  "notes": string[],              // include "Christmas postal tax stamp" + details
  
  // Most CT stamps are surcharges on other stamps:
  "overprint": {{
    "present": true,
    "type": "surcharge",
    "surcharge_denomination": {{value: number, unit: string}},
    "on_denomination": {{value: number, unit: string}},
    "color": string
  }},
  "base_stamp_ref": string        // e.g., "A210"
}}


--------------------------------------------------------------------------------
POSTAL STATIONERY (Unified Category)
--------------------------------------------------------------------------------
Postal stationery are pre-stamped items (cards, envelopes, etc.), separate from 
adhesive stamps. All types go in the "postal_stationery" array.

CODE DETECTION & TYPE MAPPING:
Pattern              stationery_type        Example
^PC\\d+[a-z]?$       → "postal_card"        PC1, PC25
^EN\\d+[a-z]?$       → "envelope"           EN1, EN12
^LS\\d+[a-z]?$       → "aerogramme"         LS1, LS5
^OEN\\d+[a-z]?$      → "official_envelope"  OEN1, OEN23
^W\\d+[a-z]?$        → "wrapper"            W1, W3

STRUCTURE:
{{
  "catalog_no": "PC1",
  "stationery_type": "postal_card",
  "denomination": {{"value": 2, "unit": "c"}},
  "color": "black",
  "paper": "buff manila",
  "size": "132 x 80 mm",
  "quantity_reported": 50000,
  "notes": [],
  
  // ONLY for postal_card type:
  "card_type": "single" | "reply" | "double"
}}

CARD TYPES (postal_card only):
- "single": Regular postal card
- "reply": Card mentions "with reply card" or "reply card"
- "double": Double-sized reply card format

OFFICIAL ENVELOPES with overprints:
{{
  "catalog_no": "OEN1",
  "stationery_type": "official_envelope",
  "denomination": {{"value": 2, "unit": "c"}},
  "color": "green",
  "notes": ["For use by Secretary of Finance"],
  "overprint": {{
    "present": true,
    "type": "overprint",
    "text": "Servicio Oficial",
    "color": "black"
  }}
}}

PROOFS & SPECIMENS OF POSTAL STATIONERY:
Identified by prefix + stationery type:
- DPPC# = Die Proof Postal Card → proofs.die_proofs[]
- DPEN# = Die Proof Envelope → proofs.die_proofs[]
- DPLS# = Die Proof Aerogramme → proofs.die_proofs[]
- DPOEN# = Die Proof Official Envelope → proofs.die_proofs[]
- DPW# = Die Proof Wrapper → proofs.die_proofs[]

- MPC# = Muestra Postal Card → specimens[]
- MEN# = Muestra Envelope → specimens[]
- MLS# = Muestra Aerogramme → specimens[]
- etc.

For specimens, use applies_to: "postal_stationery"

VARIETIES:
Lowercase suffixes go in varieties[]:
{{
  "base_catalog_no": "EN1",
  "suffix": "a",
  "type": "color" | "impression" | "overprint" | "plate_flaw",
  "description": "pale blue" | "double impression" | "inverted op"
}}

CRITICAL POSTAL STATIONERY NOTES:
- ALL postal stationery types go in ONE "postal_stationery" array
- stationery_type field distinguishes the specific type
- Postal stationery are NOT stamps - don't put in stamps[] array
- Uppercase letter suffixes (EN1A) = main items, not varieties
- Lowercase suffixes (EN1a) = varieties → varieties[] array
- Proofs use DP + type prefix (DPPC, DPEN, DPLS, DPOEN, DPW)
- Specimens use M + type prefix (MPC, MEN, MLS, MOEN, MW)

--------------------------------------------------------------------------------
OFFICIAL STAMPS (Surface Mail & Airmail)
--------------------------------------------------------------------------------
Official stamps are regular postage stamps overprinted for official government 
use. They go in the stamps[] array with special status and overprint structure.

CODE DETECTION:
Surface Mail:     ^O\\d+[a-z]?$        → O1, O25, O3a
Airmail:          ^OA\\d+[a-z]?$       → OA107, OA115, OA119a

Both types use same structure, only differ in section (Surface Mail vs Airmail).

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "O1" | "OA107",
  "issue_id": string,
  "denomination": {{
    "value": number,
    "unit": "c" | "C" | "P" | "reales"
  }},
  "color": string,                      // color of base stamp
  "perforation": string,
  "quantity_reported": number|null,
  "status": "official",                 // same status for O and OA
  "notes": string[],
  
  "overprint": {{
    "present": true,
    "type": "overprint",
    "text": string,                       // e.g., "Oficial", "OFFICIAL"
    "color": string                     // color of overprint
  }},
  "base_stamp_ref": string              // reference to base stamp
}}

OVERPRINT EXTRACTION:
- From text: "Overprint 'Oficial' in red" → text: "Oficial", color: "red"
- Color format: "5c green, in red" → base: green, overprint: red
- If no explicit text, use: "Official use overprint"

SPECIMENS: SO# (surface) and SOA# (airmail) → specimens[] with applies_to: "stamps"
VARIETIES: Lowercase suffixes (O2a, OA115a) → varieties[]
SECTION: O# → "Surface Mail", OA# → "Airmail"

EXAMPLE:

Input:
  Overprint Issue of 1934
  Overprint "Oficial" in red. Perf 12.
  
  Overprint "specimen" in red
  SOA107   5c green
  
  Regular issue
  OA107    5c green       75,000
  OA108    10c carmine rose   35,000
  OA108a   inverted overprint

Output:
{{
  "issue_data": {{"section": "Airmail", ...}},
  "stamps": [
    {{
      "catalog_no": "OA107",
      "denomination": {{"value": 5, "unit": "c"}},
      "color": "green",
      "perforation": "12",
      "quantity_reported": 75000,
      "status": "official",
      "notes": ["Official airmail stamp"],
      "overprint": {{"present": true, "type": "overprint", "text": "Oficial", "color": "red"}},
      "base_stamp_ref": "A107"
    }},
    {{
      "catalog_no": "OA108",
      "denomination": {{"value": 10, "unit": "c"}},
      "color": "carmine rose",
      "perforation": "12",
      "quantity_reported": 35000,
      "status": "official",
      "overprint": {{"present": true, "type": "overprint", "text": "Oficial", "color": "red"}},
      "base_stamp_ref": "A108"
    }}
  ],
  "varieties": [
    {{"base_catalog_no": "OA108", "suffix": "a", "type": "overprint", "description": "inverted overprint"}}
  ],
  "specimens": [
    {{"code": "SOA107", "applies_to": "stamps", "type": "overprint", "denomination": "5c", "base_color": "green", "overprint_color": "red", "notes": "SPECIMEN overprint"}}
  ]
}}

CRITICAL FOR OFFICIAL STAMPS: O# and OA# both use status: "official" and go in stamps[] array.
Base ref: remove O/OA prefix (OA107 → A107, O5 → 5).

--------------------------------------------------------------------------------
GUANACASTE OVERPRINTS
--------------------------------------------------------------------------------
Guanacaste stamps are regular postage or revenue stamps overprinted with 
"Guanacaste" for use in Guanacaste Province (1885-1892). They go in stamps[] 
array with special status and overprint structure.

CODE DETECTION:
Postage:    ^G\d+[a-z]?$        → G1, G5, G12a
Revenue:    ^GR\d+[a-z]?$       → GR1, GR5, GR8a

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "G1" | "GR1",
  "issue_id": string,
  "denomination": {{
    "value": number,
    "unit": "c" | "C" | "P" | "reales"
  }},
  "color": string,                      // color of base stamp
  "perforation": string,
  "watermark": string|null,
  "quantity_reported": number|null,
  "status": "guanacaste" | "guanacaste_revenue",
  "notes": string[],
  
  "overprint": {{
    "present": true,
    "type": "overprint",
    "text": "Guanacaste",
    "color": "black" | "red"            // extracted from section header
  }},
  "base_stamp_ref": string              // reference to base stamp
}}

STATUS VALUES:
- G# codes → status: "guanacaste"
- GR# codes → status: "guanacaste_revenue"

OVERPRINT COLOR:
Extract from section headers:
- "Black Overprint" → all following stamps have color: "black"
- "Red Overprint" → all following stamps have color: "red"

PLATE ERRORS (varieties):
When text describes plate errors with positions:
"a: first A broken (pos 19)" → create variety with position: 19

{{
  "base_catalog_no": "G1",
  "suffix": "a",
  "type": "plate_flaw",
  "description": "first A broken",
  "position": 19,
  "plate": 1
}}

VARIETIES:
- Lowercase suffixes (G1a, GR4a) go in varieties[]
- Include position when mentioned (pos 19, pos 37, etc.)
- type: usually "plate_flaw" or "impression"

GUANACASTE CRITICAL NOTES:
- G# and GR# codes go in stamps[] array
- G# → status: "guanacaste"
- GR# → status: "guanacaste_revenue"
- overprint.text always "Guanacaste"
- Overprint color from section headers (Black/Red Overprint)
- Plate errors with positions go in varieties[] with position field populated
- Base ref: G1 → "1", GR1 → "R1"

--------------------------------------------------------------------------------
SEMIPOSTAL STAMPS
--------------------------------------------------------------------------------
Semipostal stamps are postage stamps sold at a premium above face value, with 
proceeds benefiting charitable causes. They go in stamps[] array with special 
status and optional surcharge structure.

CODE DETECTION:
Regular:    ^SP\d+[a-z]?$        → SP1, SP2, SP4a
Imperf:     ^ISP\d+[a-z]?$       → ISP2, ISP4a

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "SP1" | "ISP2",
  "issue_id": string,
  "denomination": {{
    "value": number,
    "unit": "c" | "C" | "P"
  }},
  "color": string,
  "perforation": string,              // "" for ISP codes
  "quantity_reported": number|null,
  "status": "semipostal",             // for both SP and ISP
  "notes": string[],                  // include benefit purpose and premium
  
  // ONLY when surcharged:
  "overprint": {{
    "present": true,
    "type": "surcharge",
    "surcharge_denomination": {{value: number, unit: string}},
    "on_denomination": {{value: number, unit: string}},
    "color": string
  }},
  "base_stamp_ref": string
}}

NOTES FIELD:
Include charitable purpose and premium amount:
- "Sold with 10c premium for Olympic Games Committee benefit"
- "Red Cross Society benefit. Premium: 5c"

ISP CODES (Imperforates):
- ISP# are separate catalog items, NOT varieties
- status: "semipostal" (same as SP)
- perforation: "" (empty for imperf)

PROOFS:
- DPSP# → proofs.die_proofs[]
- OPSP# → proofs.overprint_proofs[]
- Color proofs in combined format: "SP2-4a brown red" means one proof sheet 
  with SP2, SP3, SP4 in same color → one entry in color_proofs[]

VARIETIES:
Lowercase suffixes (SP1a, ISP4a) go in varieties[]:
- "tete-beche" → type: "arrangement"
- "lower surcharge" → type: "overprint"
- "shifted perf" → type: "perforation"

EXAMPLE:

Input:
  Red Cross Society benefit surcharge
  October 17, 1922. Surcharge on 1910 stamp. Perf 12.
  
  Overprint Proof
  OPSP1  5c red on onionskin paper
  
  Regular issue
  SP1    5c on 5c orange (68)    200,000
  SP1a   lower surcharge
  SP1b   surcharge displaced upwards
  
  Sold with 5c premium for Red Cross benefit.
  
  Olympic Games benefit
  Die Proofs
  DPSP2  5c black
  DPSP3  10c black
  
  Three values in a sheet
  SP2-4a brown red
  SP2-4b black
  
  Imperforate
  ISP2   5c dark green    15,000
  ISP3   10c carmine      15,000
  
  Regular issue
  SP2    5c dark green    15,000
  SP3    10c carmine      15,000
  SP4    20c dark blue    15,000
  SP4a   vertical pair tete beche
  
  Sold with 10c surcharge for Olympic Games Committee benefit.

Output:
{{
  "stamps": [
    {{
      "catalog_no": "SP1",
      "denomination": {{"value": 5, "unit": "c"}},
      "color": "orange",
      "perforation": "12",
      "quantity_reported": 200000,
      "status": "semipostal",
      "notes": ["Red Cross Society benefit. Sold with 5c premium"],
      "overprint": {{
        "present": true,
        "type": "surcharge",
        "surcharge_denomination": {{"value": 5, "unit": "c"}},
        "on_denomination": {{"value": 5, "unit": "c"}},
        "color": "red"
      }},
      "base_stamp_ref": "68"
    }},
    {{
      "catalog_no": "ISP2",
      "denomination": {{"value": 5, "unit": "c"}},
      "color": "dark green",
      "perforation": "",
      "quantity_reported": 15000,
      "status": "semipostal",
      "notes": ["Olympic Games benefit. Sold with 10c premium. Imperforate"]
    }},
    {{
      "catalog_no": "ISP3",
      "denomination": {{"value": 10, "unit": "c"}},
      "color": "carmine",
      "perforation": "",
      "quantity_reported": 15000,
      "status": "semipostal",
      "notes": ["Olympic Games benefit. Sold with 10c premium. Imperforate"]
    }},
    {{
      "catalog_no": "SP2",
      "denomination": {{"value": 5, "unit": "c"}},
      "color": "dark green",
      "perforation": "12",
      "quantity_reported": 15000,
      "status": "semipostal",
      "notes": ["Olympic Games benefit. Sold with 10c premium"]
    }},
    {{
      "catalog_no": "SP3",
      "denomination": {{"value": 10, "unit": "c"}},
      "color": "carmine",
      "perforation": "12",
      "quantity_reported": 15000,
      "status": "semipostal",
      "notes": ["Olympic Games benefit. Sold with 10c premium"]
    }},
    {{
      "catalog_no": "SP4",
      "denomination": {{"value": 20, "unit": "c"}},
      "color": "dark blue",
      "perforation": "12",
      "quantity_reported": 15000,
      "status": "semipostal",
      "notes": ["Olympic Games benefit. Sold with 10c premium"]
    }}
  ],
  "varieties": [
    {{
      "base_catalog_no": "SP1",
      "suffix": "a",
      "type": "overprint",
      "description": "lower surcharge",
      "position": null,
      "plate": null
    }},
    {{
      "base_catalog_no": "SP1",
      "suffix": "b",
      "type": "overprint",
      "description": "surcharge displaced upwards",
      "position": null,
      "plate": null
    }},
    {{
      "base_catalog_no": "SP4",
      "suffix": "a",
      "type": "arrangement",
      "description": "vertical pair tete-beche",
      "position": null,
      "plate": null
    }}
  ],
  "proofs": {{
    "die_proofs": [
      {{"code": "DPSP2", "denomination": "5c", "color": "black", "substrate": "bond paper", ...}},
      {{"code": "DPSP3", "denomination": "10c", "color": "black", "substrate": "bond paper", ...}}
    ],
    "overprint_proofs": [
      {{"code": "OPSP1", "denomination": "5c", "color": "red", "substrate": "onionskin paper", ...}}
    ],
    "color_proofs": [
      {{"code": "SP2-4a", "denomination": "5c/10c/20c", "color": "brown red", "notes": "Three values in a sheet"}},
      {{"code": "SP2-4b", "denomination": "5c/10c/20c", "color": "black", "notes": "Three values in a sheet"}}
    ]
  }}
}}

SEMI POSTAL CRITICAL NOTES:
- SP# and ISP# codes go in stamps[] array
- Both use status: "semipostal"
- ISP# are imperf stamps, NOT varieties
- Notes must include benefit purpose and premium amount
- Combined color proofs (SP2-4a) → one entry in color_proofs[]
- Varieties (SP1a, ISP4a) go in varieties[]
- Proofs: DPSP#, OPSP# go in respective proof sections

--------------------------------------------------------------------------------
POSTAGE DUE STAMPS
--------------------------------------------------------------------------------
Postage due stamps are used to collect unpaid or underpaid postage. They go in 
stamps[] array with special status.

CODE DETECTION:
Regular:    ^D\d+[a-z]?$         → D1, D8, D15a
Specimens:  ^SD\d+[a-z]?$        → SD1, SD4a
Proofs:     ^DPD\d+$             → DPD1

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "D1",
  "issue_id": string,
  "denomination": {{
    "value": number,
    "unit": "c" | "C" | "P"
  }},
  "color": string,
  "perforation": string,
  "watermark": string|null,
  "quantity_reported": number|null,
  "status": "postage_due",
  "notes": string[]
}}

SPECIMENS:
SD# codes go in specimens[] array:
{{
  "code": "SD1",
  "applies_to": "stamps",
  "type": "overprint",
  "denomination": "5c",
  "base_color": string,
  "overprint_color": "black" | "purple",
  "notes": "SPECIMEN overprint" or "Waterlow & Sons Ltd/Specimen"
}}

PROOFS:
DPD# codes go in proofs.die_proofs[]:
{{
  "code": "DPD1",
  "denomination": string or "no numeral",
  "color": "black",
  "die_no": string,
  "substrate": "bond paper",
  "finish": ""
}}

VARIETIES:
Lowercase suffixes (D1a, SD4a) go in varieties[] for D codes.
For SD codes with lowercase, they may be separate specimens if significantly 
different (color/perforation changes).

EXAMPLE:

Input:
  Issue of 1903
  September 10, 1903. Decree #53. Engraved by Waterlow & Sons. Perf 14, 15.
  
  Die Proof
  DPD1  black, no numeral, bond paper #3428
  
  Overprint "specimen" in black, numerals in black
  SD1   5c slate blue
  SD2   10c brown orange
  SD4   20c carmine
  
  Overprint "Waterlow & Sons Ltd/Specimen"
  SD4a  20c red orange, imperf
  SD4b  20c red orange, perf 12.5
  
  Regular issue
  D1    5c slate blue
  D2    10c brown orange
  D3    15c yellow green
  D4    20c carmine
  D5    25c slate gray

Output:
{{
  "issue_data": {{
    "issue_id": "CR-1903-POSTAGE-DUE",
    "section": "Postage Due",
    ...
  }},
  "stamps": [
    {{
      "catalog_no": "D1",
      "denomination": {{"value": 5, "unit": "c"}},
      "color": "slate blue",
      "perforation": "14",
      "quantity_reported": null,
      "status": "postage_due",
      "notes": ["Postage due stamp"]
    }},
    {{
      "catalog_no": "D2",
      "denomination": {{"value": 10, "unit": "c"}},
      "color": "brown orange",
      "perforation": "14",
      "status": "postage_due",
      "notes": ["Postage due stamp"]
    }},
    {{
      "catalog_no": "D3",
      "denomination": {{"value": 15, "unit": "c"}},
      "color": "yellow green",
      "perforation": "14",
      "status": "postage_due",
      "notes": ["Postage due stamp"]
    }},
    {{
      "catalog_no": "D4",
      "denomination": {{"value": 20, "unit": "c"}},
      "color": "carmine",
      "perforation": "14",
      "status": "postage_due",
      "notes": ["Postage due stamp"]
    }},
    {{
      "catalog_no": "D5",
      "denomination": {{"value": 25, "unit": "c"}},
      "color": "slate gray",
      "perforation": "14",
      "status": "postage_due",
      "notes": ["Postage due stamp"]
    }}
  ],
  "proofs": {{
    "die_proofs": [
      {{
        "code": "DPD1",
        "denomination": "no numeral",
        "color": "black",
        "die_no": "3428",
        "substrate": "bond paper",
        "finish": ""
      }}
    ]
  }},
  "specimens": [
    {{
      "code": "SD1",
      "applies_to": "stamps",
      "type": "overprint",
      "denomination": "5c",
      "base_color": "slate blue",
      "overprint_color": "black",
      "notes": "SPECIMEN overprint"
    }},
    {{
      "code": "SD2",
      "applies_to": "stamps",
      "type": "overprint",
      "denomination": "10c",
      "base_color": "brown orange",
      "overprint_color": "black",
      "notes": "SPECIMEN overprint"
    }},
    {{
      "code": "SD4",
      "applies_to": "stamps",
      "type": "overprint",
      "denomination": "20c",
      "base_color": "carmine",
      "overprint_color": "black",
      "notes": "SPECIMEN overprint"
    }},
    {{
      "code": "SD4a",
      "applies_to": "stamps",
      "type": "overprint",
      "denomination": "20c",
      "base_color": "red orange",
      "overprint_color": "black",
      "notes": "Waterlow & Sons Ltd/Specimen. Imperf"
    }},
    {{
      "code": "SD4b",
      "applies_to": "stamps",
      "type": "overprint",
      "denomination": "20c",
      "base_color": "red orange",
      "overprint_color": "black",
      "notes": "Waterlow & Sons Ltd/Specimen. Perf 12.5"
    }}
  ]
}}

POSTAGE DUES CRITICAL NOTES:
- D# codes go in stamps[] with status: "postage_due"
- SD# codes go in specimens[] (NOT stamps[])
- DPD# codes go in proofs.die_proofs[]
- SD codes with suffixes (SD4a, SD4b) are separate specimen items if they have 
  different perforation/color, NOT varieties
- Section: "Postage Due" for issue_data

--------------------------------------------------------------------------------
SPECIAL DELIVERY STAMPS
--------------------------------------------------------------------------------
Special delivery stamps (Entrega Inmediata) are used for express mail service. 
They go in stamps[] array with special status.

CODE DETECTION:
Regular:       ^SD\d+[a-z]?$         → SD3, SD5, SD6a
Color Proofs:  ^CPSD\d+$             → CPSD3

STRUCTURE (in stamps[] array):
{{
  "catalog_no": "SD3",
  "issue_id": string,
  "denomination": {{
    "value": number,
    "unit": "c" | "C" | "P"
  }},
  "color": string,
  "perforation": string,
  "quantity_reported": number|null,
  "status": "special_delivery",
  "notes": string[]                   // include service type (local/international)
}}

NOTES FIELD:
Include service type when mentioned:
- "Special delivery. Local rate"
- "Special delivery. International rate"
- "Special delivery. Local and U.P.A.E. countries rate"

PROOFS:
CPSD# codes go in proofs.color_proofs[]:
{{
  "code": "CPSD3",
  "denomination": "75c",
  "color": "black",
  "notes": "Color proof"
}}

VARIETIES:
Lowercase suffixes (SD5a, SD6a) go in varieties[]:
- "triple impression" → type: "impression"
- "imperf left margin" → type: "perforation"
- "double impression" → type: "impression"

SPECIAL DELIVERY STAMPS CRITICAL NOTES:
- SD# codes go in stamps[] with status: "special_delivery"
- CPSD# codes go in proofs.color_proofs[]
- Include service type in notes (local/international/U.P.A.E.)
- Varieties (SD5a, SD6b) go in varieties[]
- Section: "Special Delivery" or can be included in "Surface Mail"/"Airmail"

--------------------------------------------------------------------------------
POSTAL RELATED REVENUE STAMPS
--------------------------------------------------------------------------------
These are stamps with postal-revenue connections:
1. PR# = Postal stamps surcharged/overprinted for Revenue/Fiscal use (various types: 
   regular fiscal, electoral, archive, etc.)
2. R# = Revenue stamps used for Postal purposes

CODE DETECTION:
Postal to Revenue:         ^PR\d+[a-z]?$       → PR4, PR12
Revenue to Postal:         ^R\d+[a-z]?$        → R23, R25
Specimens:                 ^MPR\d+[a-z]?$      → MPR12a
Surcharge Proofs:          ^SPPR\d+[a-z]?$     → SPPR13a

STRUCTURE (in stamps[] array):

FOR PR# (Postal stamps converted to revenue use):
{{
  "catalog_no": "PR4",
  "denomination": {{"value": number, "unit": "c"|"C"|"P"}},
  "color": string,
  "quantity_reported": number|null,
  "status": "postal_revenue",
  "notes": string[],                    // Include type: "Electoral stamp", "Archive stamp", etc.
  
  "overprint": {{
    "present": true,
    "type": "surcharge" | "overprint",
    "text": string,                     // e.g., "Elecciones/1946", "Timbre de/Archivo"
    "surcharge_denomination": {{...}},  // only for surcharges
    "on_denomination": {{...}},         // only for surcharges
    "color": string
  }},
  "base_stamp_ref": string
}}

FOR R# (Revenue stamps used for postage):
{{
  "catalog_no": "R23",
  "denomination": {{"value": number, "unit": "c"|"C"|"P"}},
  "color": string,
  "status": "revenue_postal",
  "notes": ["Revenue stamp used for postage without authorization"]
}}

SPECIMENS & PROOFS:
- MPR# → specimens[] with applies_to: "stamps"
- SPPR# → proofs.surcharge_proofs[] or proofs.overprint_proofs[]

NOTES FIELD for PR#:
Include specific use type:
- "Regular postal stamp surcharged for fiscal use"
- "Electoral stamp. National Exposition overprinted Elecciones/1946"
- "Archive stamp. Postage due surcharged Timbre de/Archivo"

EXAMPLE:

Input:
  REGULAR FISCAL USE
  1947. Airmail stamps surcharged "Timbre Fiscal/1947/ Dos Colones".
  PR4  2C on 5C black, in red      25,500
  
  ELECTORAL STAMPS
  1946. National Exposition stamps overprinted "Elecciones/1946", in black.
  Overprint "muestra"
  MPR12a  2c gray black
  MPR12b  3c red orange, no date
  Regular issue
  PR12    2c gray black
  
  ARCHIVE STAMPS
  1946. Postage due stamps surcharged "Timbre de/ Archivo" and value, in blue.
  Surcharge Proofs
  SPPR13a  10c on 10c violet, in black
  SPPR13b  10c on 10c violet, in red

Output:
{{
  "stamps": [
    {{
      "catalog_no": "PR4",
      "denomination": {{"value": 2, "unit": "C"}},
      "color": "black",
      "quantity_reported": 25500,
      "status": "postal_revenue",
      "notes": ["Regular airmail stamp surcharged for fiscal use. Timbre Fiscal 1947"],
      "overprint": {{
        "present": true,
        "type": "surcharge",
        "text": "Timbre Fiscal/1947/ Dos Colones",
        "surcharge_denomination": {{"value": 2, "unit": "C"}},
        "on_denomination": {{"value": 5, "unit": "C"}},
        "color": "red"
      }},
      "base_stamp_ref": "A26"
    }},
    {{
      "catalog_no": "PR12",
      "denomination": {{"value": 2, "unit": "c"}},
      "color": "gray black",
      "status": "postal_revenue",
      "notes": ["Electoral stamp. National Exposition overprinted Elecciones/1946"],
      "overprint": {{
        "present": true,
        "type": "overprint",
        "text": "Elecciones/1946",
        "color": "black"
      }},
      "base_stamp_ref": "A31"
    }}
  ],
  "specimens": [
    {{
      "code": "MPR12a",
      "applies_to": "stamps",
      "type": "overprint",
      "denomination": "2c",
      "base_color": "gray black",
      "overprint_color": "black",
      "notes": "MUESTRA overprint. Electoral stamp"
    }},
    {{
      "code": "MPR12b",
      "applies_to": "stamps",
      "type": "overprint",
      "denomination": "3c",
      "base_color": "red orange",
      "overprint_color": "black",
      "notes": "MUESTRA overprint. Electoral stamp, no date"
    }}
  ],
  "proofs": {{
    "surcharge_proofs": [
      {{
        "code": "SPPR13a",
        "denomination": "10c on 10c",
        "color": "violet",
        "surcharge_color": "black",
        "notes": "Surcharge proof. Archive stamp"
      }},
      {{
        "code": "SPPR13b",
        "denomination": "10c on 10c",
        "color": "violet",
        "surcharge_color": "red",
        "notes": "Surcharge proof. Archive stamp"
      }}
    ]
  }}
}}

POSTAL RELATED REVENUES CRITICAL NOTES:
- PR# = Postal → Revenue (various fiscal uses: regular, electoral, archive)
- R# = Revenue → Postal
- PR# codes have status: "postal_revenue" WITH overprint/surcharge
- R# codes have status: "revenue_postal" WITHOUT overprint
- MPR# specimens → specimens[] with applies_to: "stamps"
- SPPR# proofs → proofs.surcharge_proofs[] or proofs.overprint_proofs[]
- Notes should specify type: electoral, archive, regular fiscal, etc.

--------------------------------------------------------------------------------
TELEGRAPH STAMPS AND SEALS
--------------------------------------------------------------------------------
Telegraph items include regular telegraph stamps (T#), telegraph seals (TS#), 
and radiogram seals (RS#). All go in stamps[] array with appropriate status.

CODE DETECTION:
Telegraph Stamps:      ^T\d+[A-Z]?[a-z]?$     → T1, T2A, T3a
Telegraph Seals:       ^TS\d+[a-z]?$          → TS1, TS5, TS8a
Radiogram Seals:       ^RS\d+[a-z]?$          → RS1, RS6
Imperf Radiogram:      ^IRS\d+[a-z]?$         → IRS1

Proofs:
Die Proofs Telegraph:       ^DPT\d+[a-z]?$     → DPT1
Plate Proofs Telegraph:     ^PPT\d+[a-z]?$     → PPT1
Plate Proofs Telegraph Seal: ^PTS\d+[a-z]?$    → PTS2, PTS5a
Specimens Telegraph:        ^ST\d+[a-z]?$      → ST1

IMPORTANT: Uppercase letter suffixes (T2A) are separate stamps, not varieties.
Lowercase suffixes (TS2a, RS5a) are varieties.

STRUCTURE (in stamps[] array):

FOR T# (Telegraph Stamps):
{{
  "catalog_no": "T2" | "T2A",
  "denomination": {{"value": number, "unit": "c"|"C"|"P"}},
  "color": string,
  "perforation": string,
  "watermark": string|null,
  "status": "telegraph",
  "notes": ["Telegraph stamp"]
}}

FOR TS# (Telegraph Seals):
{{
  "catalog_no": "TS2",
  "denomination": {{"value": null, "unit": "seal"}},  // seals have no denomination
  "color": string,
  "paper": string,                    // "white paper", "pink paper", etc.
  "perforation": string,
  "status": "telegraph_seal",
  "notes": ["Telegraph seal"]
}}

FOR RS# and IRS# (Radiogram Seals):
{{
  "catalog_no": "RS1" | "IRS1",
  "denomination": {{"value": null, "unit": "seal"}},
  "color": string,
  "paper": string,                    // paper color
  "perforation": string,              // often ""
  "status": "radiogram_seal",
  "notes": ["Radiogram seal. Design with CR/RN in center"]
}}

PROOFS:
- DPT#/PPT# → telegraph stamp proofs
- PTS# → telegraph seal proofs (proofs.plate_proofs[])

EXAMPLE:

Input:
  TELEGRAPH SEALS
  Lithography by Litografia Nacional. Perf 12.
  
  Proofs, imperf
  PTS2   blue, white paper
  PTS2a  blue, pink paper
  PTS5   dark blue, white paper
  
  Regular issue
  TS1    blue
  TS2    light blue, perf 12
  TS2a   horizontal pair imperf between
  TS3    black, imperf
  TS5    dark blue, white paper
  TS8    blue, pink paper
  
  RADIOGRAM SEALS
  "Radios de Costa Rica". "CR" in center. Imperforate.
  
  IRS1   brown, yellow paper
  RS1    brown, yellow paper
  RS2    reddish brown, pink paper
  RS6    brown, pink paper

Output:
{{
  "stamps": [
    {{
      "catalog_no": "TS1",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "blue",
      "paper": "",
      "perforation": "12",
      "status": "telegraph_seal",
      "notes": ["Telegraph seal"]
    }},
    {{
      "catalog_no": "TS2",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "light blue",
      "paper": "",
      "perforation": "12",
      "status": "telegraph_seal",
      "notes": ["Telegraph seal"]
    }},
    {{
      "catalog_no": "TS3",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "black",
      "paper": "",
      "perforation": "",
      "status": "telegraph_seal",
      "notes": ["Telegraph seal. Imperf"]
    }},
    {{
      "catalog_no": "TS5",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "dark blue",
      "paper": "white paper",
      "perforation": "12",
      "status": "telegraph_seal",
      "notes": ["Telegraph seal"]
    }},
    {{
      "catalog_no": "TS8",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "blue",
      "paper": "pink paper",
      "perforation": "12",
      "status": "telegraph_seal",
      "notes": ["Telegraph seal"]
    }},
    {{
      "catalog_no": "IRS1",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "brown",
      "paper": "yellow paper",
      "perforation": "",
      "status": "radiogram_seal",
      "notes": ["Radiogram seal. Radios de Costa Rica. Imperf"]
    }},
    {{
      "catalog_no": "RS1",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "brown",
      "paper": "yellow paper",
      "perforation": "",
      "status": "radiogram_seal",
      "notes": ["Radiogram seal. Radios de Costa Rica. Imperf"]
    }},
    {{
      "catalog_no": "RS2",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "reddish brown",
      "paper": "pink paper",
      "perforation": "",
      "status": "radiogram_seal",
      "notes": ["Radiogram seal. Design with CR in center. Imperf"]
    }},
    {{
      "catalog_no": "RS6",
      "denomination": {{"value": null, "unit": "seal"}},
      "color": "brown",
      "paper": "pink paper",
      "perforation": "",
      "status": "radiogram_seal",
      "notes": ["Radiogram seal. Design with RN in center. Imperf"]
    }}
  ],
  "varieties": [
    {{
      "base_catalog_no": "TS2",
      "suffix": "a",
      "type": "perforation",
      "description": "horizontal pair imperf between",
      "position": null,
      "plate": null
    }}
  ],
  "proofs": {{
    "plate_proofs": [
      {{"code": "PTS2", "denomination": "seal", "color": "blue", "notes": "Telegraph seal proof. White paper. Imperf"}},
      {{"code": "PTS2a", "denomination": "seal", "color": "blue", "notes": "Telegraph seal proof. Pink paper. Imperf"}},
      {{"code": "PTS5", "denomination": "seal", "color": "dark blue", "notes": "Telegraph seal proof. White paper. Imperf"}}
    ]
  }}
}}

TELEGRAPH STAMPS AND SEALS CRITICAL NOTES:
- T# → status: "telegraph" (stamps with denominations)
- TS# → status: "telegraph_seal" (seals without denominations)
- RS#/IRS# → status: "radiogram_seal" (radio service seals)
- Seals have denomination.unit: "seal" with value: null
- Paper color is important field for seals
- T#A (uppercase) = separate stamps in stamps[]
- TS#a/RS#a (lowercase) = varieties in varieties[]
- PTS# proofs → proofs.plate_proofs[]
- IRS# are complete catalog codes, NOT varieties

--------------------------------------------------------------------------------
QUANTITIES & DATES
--------------------------------------------------------------------------------
- Tables of print runs by denomination → "production_orders.printings".
- DO NOT assign table totals to "stamps[*].quantity_reported" unless that SPECIFIC catalog number line states a quantity.
- Ignore placeholder zeros: if a table shows 0 for a denomination/period, omit that row.
- Remainders (Mint/Used) → "production_orders.remainders.quantities" with the Mint/Used tag in "plate_desc".
- Dates: ISO (YYYY-MM-DD). If only month/year, use first day of month. Do NOT invent "probable_first_circulation" unless explicitly stated.

--------------------------------------------------------------------------------
PERFORATIONS
--------------------------------------------------------------------------------
- "issue_data.perforation" can contain a range summary (e.g., "13.5-15.5") IF stated.
- "stamps[*].perforation" should contain a specific gauge if uniquely stated for that stamp; otherwise "" (do not copy the range blindly).
- Do NOT include the word "perf" in this field (gauge only).

--------------------------------------------------------------------------------
POST-PARSE VALIDATION CHECKLIST (MANDATORY)
--------------------------------------------------------------------------------
Before emitting JSON, ensure ALL are true:
1) All S-codes are present EXCLUSIVELY in top-level "specimens". There are NO S-codes in "varieties", "proofs", or "stamps".
2) "varieties" ONLY contains lowercase-suffix items of real main catalog numbers (e.g., "31a"). No S-codes here.
3) "stamps[*].quantity_reported" is null unless a catalog line gives a specific quantity for that exact number.
4) Progressive dies (e.g., "DPA…") are in "proofs.die_proofs" with finish starting "progressive:".
5) No placeholder zeros were recorded in "production_orders.printings".
6) All required top-level keys exist and are arrays/objects per schema.
7) Notes fields are strings or arrays of strings (no nested objects).
8) No extra keys beyond the schema.

--------------------------------------------------------------------------------
JSON FORMAT GUARDRAILS
--------------------------------------------------------------------------------
- Return ONLY the JSON object (no commentary or markdown).
- Never emit code inside values.
- Unknowns → "", null, [], or {{}} per schema.
- If multiple notes are present, return an array of strings. If a single note is present, a single string is acceptable.

"""

        final_prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            few_shot,
            ("human", "{input}")
        ])

        return final_prompt | self.llm | self.output_parser




In [None]:
# Preview of the second grouped chunk (index 1): concatenate the cleaned
# markdown of its chunks, print the text found between markers, then print and
# keep the remaining plain text in `final_test_text`.
final_test_text = ""
for grouped_chunk in total_grouped_chunks[1:2]:  # the slice holds at most one group
    chunks = grouped_chunk['chunks']
    final_text = ""
    for chunk in chunks:
        # Attestation and marginalia chunks contribute nothing to the text.
        if chunk['type'] not in ('attestation', 'marginalia'):
            final_text += clean_chunk_text(chunk['markdown'])
    print("------- TEXTO FIGURAS ----------")
    print(extraer_texto_entre_marcadores(final_text))
    print("------- PLAIN TEXT -------------")
    final_test_text = eliminar_texto_entre_marcadores(final_text)
    print(final_test_text)

In [None]:
# ----------------------------
# Quick test / demo
# ----------------------------


# Input for the demo run: the plain text stored in `final_test_text` by the
# preview cell above (that cell must have been executed first).
TEST = final_test_text


def test_mena_parser():
    parser = MenaParser(
        openai_api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
        model_name="gpt-5-mini", #gpt-4o-mini #"gpt-4.1-mini"
        temperature=1, #0.0 para los demas       
        
    )
    result = parser.parse_chunk(TEST)
    print(json.dumps(result, indent=2, ensure_ascii=False))
    with open("mena_parse_result.json", "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print("Saved to mena_parse_result.json")
    return result


# Run the demo only when this code executes as the main module.
if __name__ == "__main__":
    test_mena_parser()

In [None]:
# Number of grouped chunks currently loaded (displayed as the cell output).
len(total_grouped_chunks)

## Código Batch Parser Mena

In [None]:
# -*- coding: utf-8 -*-
import os, json, time, datetime, traceback
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice
from typing import List, Dict, Any
from tqdm import tqdm

# =========================================================
# Parser configuration
# =========================================================
parser = MenaParser(
    openai_api_key=os.getenv("OPENAI_API_KEY", "your-api-key"),
    model_name="gpt-5-mini",   # adjust if you use a different model
    temperature=1.0,           # 0.0 for more determinism
)

# =========================================================
# Run parameters
# =========================================================
# start_num / final_num are 1-based group numbers; start_idx is the matching
# zero-based slice start, and final_num doubles as the exclusive slice end in
# total_grouped_chunks[start_idx:final_num].
start_num        = 800 + 1
final_num        = 838
start_idx        = start_num - 1
subbatch_size    = 8        # items per wave
max_workers      = 4        # concurrency within a wave
max_retries      = 2        # retries per item
backoff_base_sec = 2        # 2, 4, 8... (if you raise the number of attempts)

# Output files (both file names carry the processed group range)
OUT_DIR          = "results/parsed_catalogues"
OUT_OK           = os.path.join(OUT_DIR, f"mena_parse_results_{start_num}-{final_num}.json")
OUT_ERR          = os.path.join(OUT_DIR, f"mena_parse_errors_{start_num}-{final_num}.json")

# =========================================================
# Helpers
# =========================================================
def chunked(it, size):
    """Yield successive lists of at most ``size`` items taken from ``it``.

    The last list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(it)
    block = list(islice(source, size))
    while block:
        yield block
        block = list(islice(source, size))

def preparar_texto(group_chunk: Dict[str, Any]) -> str:
    """
    Build the text that is sent to the parser for one grouped chunk.

    Every chunk's 'markdown' (missing or None is treated as '') is passed
    through clean_chunk_text, except chunks whose 'type' is 'attestation' or
    'marginalia'. The cleaned pieces are concatenated without a separator and
    the result is returned after eliminar_texto_entre_marcadores is applied.
    """
    ignored_types = ('attestation', 'marginalia')
    cleaned_parts = [
        clean_chunk_text(ch.get('markdown', '') or '')
        for ch in group_chunk.get('chunks', [])
        if ch.get('type') not in ignored_types
    ]
    # The marker-delimited segments are stripped here; the same raw text could
    # be fed to extraer_texto_entre_marcadores for diagnostics if ever needed.
    return eliminar_texto_entre_marcadores("".join(cleaned_parts))

def safe_parse(text: str, retries: int = 0, max_retries: int = 2):
    """
    Call ``parser.parse_chunk(text)``, retrying with exponential backoff on failure.

    Args:
        text: Payload handed to the module-level ``parser``.
        retries: Not used by the implementation; kept so existing positional
            calls keep working.
        max_retries: Number of retries allowed after the first failed attempt.

    Returns:
        ``(True, parsed_output)`` on success, or
        ``(False, {"error": ..., "traceback": ...})`` once the retries are
        exhausted. Exceptions raised by the parser are never propagated.
    """
    attempt = 0
    while True:
        try:
            return True, parser.parse_chunk(text)
        except Exception as e:
            attempt += 1
            if attempt > max_retries:
                return False, {
                    "error": f"{type(e).__name__}: {str(e)}",
                    "traceback": traceback.format_exc()
                }
            # Wait base**attempt seconds: 2, 4, 8... for a base of 2, the
            # sequence documented next to backoff_base_sec. The previous
            # exponent (attempt - 1) started the sequence at 1 second.
            sleep(backoff_base_sec ** attempt)

# =========================================================
# Input preparation
# =========================================================
# Created before the preparation loop so that groups whose text cannot be
# built are reported together with the parse errors instead of being dropped.
error_groups: List[Dict[str, Any]] = []

inputs: List[Dict[str, Any]] = []
for i, group_chunk in enumerate(total_grouped_chunks[start_idx:final_num], start_num):
    try:
        final_text = preparar_texto(group_chunk)
        inputs.append({"i": i, "payload": final_text})
    except Exception as e:
        # The group is not sent to the LLM; record the failure and carry on
        # with the remaining groups.
        error_groups.append({
            "group_number": i,
            "error": f"PREPARE_FAILURE: {type(e).__name__}: {str(e)}",
            "traceback": traceback.format_exc()
        })

# =========================================================
# Wave (sub-batch) execution with concurrency
# =========================================================
os.makedirs(OUT_DIR, exist_ok=True)
# One slot per prepared input; a slot stays None until its parse succeeds.
results: List[Any] = [None] * len(inputs)

t0_global = time.perf_counter()
total_items = len(inputs)
remaining = total_items

with tqdm(total=total_items, desc="Parseando (oleadas)", unit="grp") as pbar:
    base = 0  # position in `results` of the first item of the current wave
    for sub in chunked(inputs, subbatch_size):
        t_oleada = time.perf_counter()
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            # Each future maps to (slot in `results`, original group number).
            futures = {
                ex.submit(safe_parse, item["payload"], 0, max_retries): (base + j, item["i"])
                for j, item in enumerate(sub)
            }

            for future in as_completed(futures):
                idx_global, group_number = futures[future]
                try:
                    ok, data = future.result()
                    if not ok:
                        error_groups.append({
                            "group_number": group_number,
                            "error": data["error"],
                            "traceback": data.get("traceback", "")
                        })
                    else:
                        results[idx_global] = data
                except Exception as e:
                    # Unexpected failure while collecting the future's result
                    error_groups.append({
                        "group_number": group_number,
                        "error": f"FUTURE_FAILURE: {type(e).__name__}: {str(e)}",
                        "traceback": traceback.format_exc()
                    })
                finally:
                    pbar.update(1)

        # Per-wave metrics: wave duration, running average per item and ETA.
        base += len(sub)
        iter_sec = time.perf_counter() - t_oleada
        done = min(base, total_items)
        elapsed = time.perf_counter() - t0_global
        avg = elapsed / max(1, done)
        remaining_sec = avg * (total_items - done)
        eta = datetime.timedelta(seconds=max(0, int(remaining_sec)))
        pbar.set_postfix(oleada_s=f"{iter_sec:.2f}", avg_s=f"{avg:.2f}", eta=str(eta))

elapsed = time.perf_counter() - t0_global
print(f"Tiempo total: {datetime.timedelta(seconds=int(elapsed))}")

# =========================================================
# Saving results
# =========================================================
# Slots that are still None never received a successful parse; leave them out.
ok_results = [r for r in results if r is not None]
for out_path, payload in ((OUT_OK, ok_results), (OUT_ERR, error_groups)):
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

print(f"OK: {len(ok_results)} | Errores: {len(error_groups)}")
print(f"Guardados:\n- {OUT_OK}\n- {OUT_ERR}")


## Merge the Catalogue Results into One File

In [None]:
import os, json, glob

# Directory that holds the per-range JSON files; the merged files are written here too.
IN_DIR = "results/parsed_catalogues"
# Destinations of the merged results and merged errors.
OUT_ALL_OK  = os.path.join(IN_DIR, "scott_parse_results_ALL.json")
OUT_ALL_ERR = os.path.join(IN_DIR, "scott_parse_errors_ALL.json")

def load_json_list(path):
    """Read the JSON file at ``path``; return its content if it is a list, else ``[]``."""
    with open(path, "r", encoding="utf-8") as f:
        parsed = json.load(f)
    if isinstance(parsed, list):
        return parsed
    return []

def save_json(path, data):
    """Write ``data`` to ``path`` as UTF-8 JSON, indented by 2 and without ASCII escaping."""
    with open(path, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)

def _merge_matching(pattern):
    """Return ``(files, merged_items)`` for the files in IN_DIR matching ``pattern``.

    Files ending in ``_ALL.json`` (previous merge outputs) are skipped, and the
    remaining files are concatenated in sorted path order.
    """
    files = [
        p for p in sorted(glob.glob(os.path.join(IN_DIR, pattern)))
        if not p.endswith("_ALL.json")
    ]
    merged = []
    for p in files:
        merged.extend(load_json_list(p))
    return files, merged


# NOTE(review): only `scott_parse_*` files are merged here, while the batch
# cell in this notebook writes `mena_parse_*` files and a later cell reads
# `mena_parse_results_ALL.json` — confirm whether a Mena merge is also needed.

# --- Merge RESULTS ---
result_files, all_results = _merge_matching("scott_parse_results_*.json")
save_json(OUT_ALL_OK, all_results)
print(f"Resultados unidos: {len(all_results)} -> {OUT_ALL_OK}")

# --- Merge ERRORS ---
error_files, all_errors = _merge_matching("scott_parse_errors_*.json")
save_json(OUT_ALL_ERR, all_errors)
print(f"Errores unidos: {len(all_errors)} -> {OUT_ALL_ERR}")


### Print Results of Parsed Catalogues

In [None]:
from pathlib import Path
# Path to the merged file
PATH = Path("results/parsed_catalogues/mena_parse_results_ALL.json")

# Load the merged items
items = json.loads(PATH.read_text(encoding="utf-8"))

# Count
print(f"Total de items: {len(items)}")

# One summary line per item; a missing or empty field is shown as an em dash.
for i, item in enumerate(items, start=1):
    issue_data = item.get("issue_data") or {}
    issue_id = issue_data.get("issue_id")
    title = issue_data.get("title")
    print(f"[{i}] issue_id={issue_id or '—'} | title={title or '—'}")

In [None]:
from pathlib import Path
# Path to the merged file
PATH = Path("results/parsed_catalogues/scott_parse_results_ALL.json")

# Load the merged items
items = json.loads(PATH.read_text(encoding="utf-8"))

# Count
print(f"Total de items: {len(items)}")

# Print the scott_number of every stamp; items with no "stamps" contribute nothing.
for item in items:
    for stamp in item.get("stamps") or []:
        print(stamp.get("scott_number"))