# Mena to Scott Catalog Matcher

**Fixed and Complete Version**

Matches stamps from the Mena catalog to the Scott catalog using multi-signal scoring.

## Key Features:
- ✅ Multi-signal scoring (denomination, color, year, perforation)
- ✅ Handles nested Scott data structure
- ✅ Enriches variety stamps with base stamp data
- ✅ Normalizes abbreviations ("2 reales" ↔ "2r", "pl brn" ↔ "pale brown")
- ✅ Color family matching
- ✅ Confidence scoring with thresholds

## 1. Imports

In [None]:
from pathlib import Path
import json
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import re
from difflib import SequenceMatcher

## 2. Data Structures

In [None]:
@dataclass
class MatchResult:
    """One accepted pairing of a Mena catalog entry with a Scott catalog entry.

    Instances are built by ``find_optimal_assignment`` from the scored
    candidate that the assignment step selected for a Mena stamp.
    """
    # Catalog number of the Mena stamp.
    mena_catalog_no: str
    # Scott number; the assignment step stores it as "<number> (<year>)".
    scott_number: str
    # Confidence label: "HIGH", "MEDIUM" or "LOW".
    confidence: str
    # Total match score awarded to this pairing.
    score: float
    # Per-signal score contributions keyed by signal name.
    signals: Dict[str, float]
    # Human-readable summary of the signals, joined with " | ".
    breakdown: str
    # Reasons for score boosts (the assignment step passes an empty list).
    boost_reasons: List[str]
    # True when the score is below the HIGH-confidence cutoff.
    requires_review: bool


@dataclass
class UnmatchedEntry:
    """A catalog entry for which no match was found."""
    # Catalog number of the entry that stayed unmatched.
    catalog_no: str
    # Denomination of the entry, as text.
    denomination: str
    # Color of the entry, as text.
    color: str
    # Explanation of why no match was made.
    # NOTE(review): not populated anywhere in the visible code - confirm usage.
    reason: str

## 3. Normalization Dictionaries and Functions

In [None]:
# Color abbreviation mappings: catalog shorthand -> spelled-out color name.
# normalize_color() looks the whole lower-cased color string up in this table.
COLOR_ABBREVIATIONS = {
    "pl brn": "pale brown", "dk brn": "dark brown", "lt bl": "light blue",
    "dk bl": "dark blue", "org": "orange", "grn": "green", "dk grn": "dark green",
    "lt grn": "light green", "yel": "yellow", "blk": "black", "scar": "scarlet",
    "car": "carmine", "vio": "violet", "pur": "purple", "brn": "brown",
    "ol": "olive", "org red": "orange red", "red brn": "red brown", "gray": "grey",
}

# Color family groupings: two different colors from the same family are
# treated as a near match by calculate_color_family_similarity().
# Lookup returns the FIRST family that lists a color, so "yellow green"
# resolves to "yellow_family" (dict order) even though it is listed twice.
COLOR_FAMILIES = {
    "blue_family": ["blue", "light blue", "dark blue", "pale blue", "ultramarine",
                    "blue violet", "pale gray violet", "gray violet"],
    # Both spellings are listed: "vermilion" is the standard spelling,
    # "vermillion" is kept for data that uses the variant.
    "red_family": ["red", "scarlet", "carmine", "rose", "vermilion", "vermillion",
                   "crimson", "dark red", "rose red", "lake"],
    "yellow_family": ["yellow", "orange", "lemon", "gold", "amber", "yellow green"],
    "green_family": ["green", "light green", "dark green", "olive", "emerald",
                     "yellow green"],
    "brown_family": ["brown", "pale brown", "dark brown", "sepia", "chocolate",
                     "red brown"],
}


def normalize_color(color_string: str) -> str:
    """Normalize a color string to a canonical lower-case form.

    The input is lower-cased and every run of whitespace is collapsed to a
    single space *before* the abbreviation lookup, so irregularly spaced
    shorthand such as "Pl  Brn" still expands to "pale brown".

    Args:
        color_string: Raw color text; may be empty or None.

    Returns:
        The expansion from COLOR_ABBREVIATIONS when the whole normalized
        string is a known abbreviation, otherwise the normalized string
        itself. Empty/None input yields "".
    """
    if not color_string:
        return ""
    collapsed = " ".join(color_string.lower().split())
    return COLOR_ABBREVIATIONS.get(collapsed, collapsed)

def clean_scott_color(color_string: str) -> str:
    """Remove a trailing overprint-color marker from a Scott color string.

    A marker is a parenthesised one- or two-letter code starting with a
    capital, e.g. "(R)", "(Bk)", "(BI)", "(G)", "(V)". Surrounding
    whitespace is tolerated, so "carmine (Bk) " is cleaned as well.

    Examples:
        "carmine (Bk)" → "carmine"
        "green (R)" → "green"
        "blue vio (R)" → "blue vio"
        "blue (BI)" → "blue"

    Args:
        color_string: Raw Scott color text; may be empty or None.

    Returns:
        The color text without the marker, stripped of outer whitespace.
        Empty/None input yields "".
    """
    if not color_string:
        return ""

    # Second letter may be upper or lower case: the data contains both
    # "(Bk)" and "(BI)" style codes.
    cleaned = re.sub(r'\s*\([A-Z][A-Za-z]?\)\s*$', '', color_string)

    return cleaned.strip()

def find_color_family(color: str) -> Optional[str]:
    """Return the name of the first color family listing *color*, else None.

    The color is passed through normalize_color() before the lookup, and
    families are checked in COLOR_FAMILIES' own order.
    """
    canonical = normalize_color(color)
    return next(
        (name for name, members in COLOR_FAMILIES.items() if canonical in members),
        None,
    )


def calculate_color_family_similarity(color1: str, color2: str) -> float:
    """Score how alike two color names are, on a 0.0-1.0 scale.

    Returns:
        1.0 when both normalize to the same string; 0.85 when they fall in
        the same color family; 0.3 when both have a family but the families
        differ; otherwise the SequenceMatcher ratio of the normalized names.
    """
    first = normalize_color(color1)
    second = normalize_color(color2)
    if first == second:
        return 1.0

    first_family = find_color_family(first)
    second_family = find_color_family(second)

    # Without a family on both sides there is nothing categorical to compare,
    # so fall back to plain string similarity.
    if not (first_family and second_family):
        return SequenceMatcher(None, first, second).ratio()

    # Same family (e.g. light blue vs dark blue) is a near match; different
    # families (e.g. brown vs green) are deliberately scored low.
    return 0.85 if first_family == second_family else 0.3

In [None]:
def parse_denomination_string(denom_string: str) -> Dict[str, Any]:
    """Parse a Scott denomination, including the "X on Y" surcharge form.

    The text is lower-cased and stripped. When it contains exactly one
    " on " separator, both sides are parsed with parse_simple_denomination()
    and the right-hand side is reported under the "surcharge" key; any other
    text is parsed as a single plain denomination.

    Examples:
        '½r' → {"value": 0.5, "unit": "real"}
        '2r' → {"value": 2.0, "unit": "real"}
        '1c on 20c' → {"value": 1.0, "unit": "centavo",
                       "surcharge": {"on_value": 20.0, "on_unit": "centavo"}}

    Returns:
        A dict with "value" and "unit" (both None for empty input), plus
        "surcharge" for surcharged denominations.
    """
    if not denom_string:
        return {"value": None, "unit": None}

    text = denom_string.lower().strip()

    # Exactly one separator means a well-formed "new on original" surcharge.
    if text.count(" on ") == 1:
        new_text, original_text = text.split(" on ")
        new_face = parse_simple_denomination(new_text.strip())
        original_face = parse_simple_denomination(original_text.strip())
        return {
            "value": new_face["value"],
            "unit": new_face["unit"],
            "surcharge": {
                "on_value": original_face["value"],
                "on_unit": original_face["unit"],
            },
        }

    return parse_simple_denomination(text)


def fix_scott_surcharge_data(scott_stamp: Dict[str, Any]) -> Dict[str, Any]:
    """Repair Scott entries whose surcharge text landed in the color field.

    When the color field starts with "on " it actually holds the second half
    of a surcharge denomination. The two halves are rejoined, any trailing
    two-digit year marker such as ('82) is dropped, and the color is replaced
    by a placeholder because the real color is not recoverable here.

    Example:
        Input:  {denomination: "1c", color: "on ½r ('82)"}
        Output: {denomination: "1c on ½r", color: "surcharge color unknown",
                 original_color_field: "on ½r ('82)"}

    Args:
        scott_stamp: Scott stamp record; it is never mutated.

    Returns:
        A new dict with the repaired fields, or the original object unchanged
        when the color field does not start with "on ".
    """
    raw_denomination = scott_stamp.get('denomination')
    raw_color = scott_stamp.get('color')
    # Treat None as "missing" instead of turning it into the text "None".
    denom = '' if raw_denomination is None else str(raw_denomination).strip()
    color = '' if raw_color is None else str(raw_color).strip()

    if not color.lower().startswith('on '):
        return scott_stamp

    # Drop a year marker like ('82) and everything after it.
    surcharge_part = re.sub(r'\s*\([\'"]?\d{2}\).*$', '', color)
    # strip() avoids a leading space when the denomination itself is empty.
    full_denomination = f"{denom} {surcharge_part}".strip()

    return {
        **scott_stamp,
        'denomination': full_denomination,
        # The actual color is unknown at this point; use a fixed placeholder.
        'color': "surcharge color unknown",
        'original_color_field': color,  # Keep for reference
    }

# Abbreviated, spelled-out and plural unit names mapped to one singular form,
# matching the unit names produced for Mena denominations.
_DENOMINATION_UNIT_ALIASES = {
    'r': 'real', 'real': 'real', 'reales': 'real',
    'p': 'peso', 'ps': 'peso', 'peso': 'peso', 'pesos': 'peso',
    'c': 'centavo', 'ct': 'centavo', 'cts': 'centavo',
    'centavo': 'centavo', 'centavos': 'centavo',
}


def parse_simple_denomination(denom_string: str) -> Dict[str, Any]:
    """Parse one plain denomination such as "2r", "½r", "2½c" or "2 reales".

    The text is lower-cased first. A "½" adds 0.5 to any whole number written
    directly before it ("2½c" → 2.5, "½r" → 0.5). The unit is whatever is
    left after removing digits, "½", whitespace and periods, mapped through
    the alias table so that "r"/"reales", "p"/"pesos" and "c"/"cts"/"centavos"
    all come out in singular form. Unknown units are returned as found.

    Args:
        denom_string: Denomination text; may be empty or None.

    Returns:
        {"value": float, "unit": str}, or {"value": None, "unit": None} when
        the input is empty or contains no number.
    """
    if not denom_string:
        return {"value": None, "unit": None}

    text = denom_string.strip().lower()

    if "½" in text:
        # A mixed fraction keeps its whole part: "2½" is 2.5, not 0.5.
        whole_part = re.search(r'(\d+)\s*½', text)
        value = (float(whole_part.group(1)) if whole_part else 0.0) + 0.5
        unit = re.sub(r'[½\d\s.]', '', text)
    else:
        number = re.search(r'(\d+\.?\d*)', text)
        if number is None:
            return {"value": None, "unit": None}
        value = float(number.group(1))
        unit = re.sub(r'[\d\s.]', '', text)

    unit = unit.strip()
    return {"value": value, "unit": _DENOMINATION_UNIT_ALIASES.get(unit, unit)}


def normalize_denomination(value: float, unit: str) -> Dict[str, Any]:
    """Bring a Mena denomination unit into the singular form used for Scott.

    The unit is lower-cased and stripped, one plural ending is removed
    ("es" first, otherwise "s"), and the short forms p/ps, r and c/ct are
    expanded to peso, real and centavo. The value is passed through as is.
    """
    singular = unit.lower().strip()

    # Drop a single plural ending: "reales" -> "real", "centavos" -> "centavo".
    for plural_ending in ('es', 's'):
        if singular.endswith(plural_ending):
            singular = singular[:-len(plural_ending)]
            break

    # Expand the abbreviated unit names.
    short_forms = {
        'p': 'peso', 'ps': 'peso',
        'r': 'real',
        'c': 'centavo', 'ct': 'centavo',
    }
    singular = short_forms.get(singular, singular)

    return {"value": value, "unit": singular}

## 4. Year Extraction Functions

In [None]:
def extract_primary_year(issue_dates: Dict[str, Any]) -> Optional[int]:
    """Return the first four-digit year found in the Mena issue dates.

    Date fields are consulted in a fixed priority order: placed_on_sale,
    probable_first_circulation, announced. Missing or empty fields, and
    fields without a four-digit number, are skipped. Returns None when no
    field yields a year.
    """
    for field in ('placed_on_sale', 'probable_first_circulation', 'announced'):
        if field not in issue_dates:
            continue
        raw_date = issue_dates[field]
        if not raw_date:
            continue
        found = re.search(r'(\d{4})', str(raw_date))
        if found:
            return int(found.group(1))
    return None


def extract_scott_year(scott_stamp: Dict[str, Any]) -> Optional[int]:
    """Return the year of a Scott stamp entry, or None when unavailable.

    A truthy 'year' field wins and is converted with int(); otherwise the
    first four-digit number in a truthy 'header' field is used.
    """
    explicit_year = scott_stamp.get('year')
    if explicit_year:
        return int(explicit_year)

    header = scott_stamp.get('header')
    if header:
        found = re.search(r'(\d{4})', str(header))
        if found:
            return int(found.group(1))

    return None

## 5. Scott Data Preprocessing

This section handles:
1. Flattening nested Scott catalog structure
2. Enriching variety stamps with base stamp data
3. Adding year information from headers

In [None]:
# Color phrases recognised inside variety descriptions, in priority order.
# Multi-word phrases that contain a shorter keyword ("blue violet" contains
# "blue" and "violet") are listed ahead of it so they are not shadowed.
_VARIETY_COLOR_KEYWORDS = (
    'pale gray violet', 'blue violet', 'gray violet',
    'light blue', 'dark blue', 'pale blue', 'blue',
    'light green', 'dark green', 'pale green', 'green',
    'light brown', 'dark brown', 'pale brown', 'brown',
    'light violet', 'dark violet', 'pale violet', 'violet',
    'scarlet', 'red', 'carmine', 'rose', 'vermillion',
    'yellow', 'orange', 'lemon', 'gold',
    'black', 'purple', 'gray', 'grey',
)

# Whole-word patterns, so that "red" does not fire on "centered"/"colored".
_VARIETY_COLOR_PATTERNS = tuple(
    (keyword, re.compile(r'\b' + re.escape(keyword) + r'\b'))
    for keyword in _VARIETY_COLOR_KEYWORDS
)


def _color_from_description(description: Any) -> Optional[str]:
    """Return the highest-priority color keyword found as whole words, else None."""
    text = str(description).lower()
    for keyword, pattern in _VARIETY_COLOR_PATTERNS:
        if pattern.search(text):
            return keyword
    return None


def enrich_variety_stamps(scott_stamps: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fill gaps in variety stamps from their base stamp.

    A stamp is a variety when its 'variety_of' field is truthy. Base stamps
    are indexed by (scott_number, year, header) so that a variety is matched
    to the base stamp of the SAME issue, avoiding cross-issue contamination.

    For a variety whose base is found in the same issue:
      * a missing denomination is inherited;
      * a missing color is taken from the variety's own description when it
        names a known color (whole-word match), otherwise inherited;
      * a missing perforation is inherited.
    When no same-issue base exists, the first base stamp with the same number
    supplies denomination and color only.

    Args:
        scott_stamps: Flat list of Scott stamp records; never mutated.

    Returns:
        A new list of shallow copies, in input order, with varieties enriched.
    """
    # Index base stamps by number AND issue context.
    base_stamps = {}
    for stamp in scott_stamps:
        if not stamp.get('variety_of'):
            key = (stamp.get('scott_number', ''), stamp.get('year'), stamp.get('header', ''))
            base_stamps[key] = stamp

    enriched = []
    for stamp in scott_stamps:
        stamp_copy = stamp.copy()
        base_no = stamp.get('variety_of')

        if base_no:
            same_issue_key = (base_no, stamp.get('year'), stamp.get('header', ''))
            base_stamp = base_stamps.get(same_issue_key)

            if base_stamp:
                if not stamp_copy.get('denomination') and base_stamp.get('denomination'):
                    stamp_copy['denomination'] = base_stamp['denomination']

                # The variety's own description is the best source for its color.
                if not stamp_copy.get('color') and stamp.get('description'):
                    described_color = _color_from_description(stamp['description'])
                    if described_color:
                        stamp_copy['color'] = described_color

                # If still no color, inherit from base.
                if not stamp_copy.get('color') and base_stamp.get('color'):
                    stamp_copy['color'] = base_stamp['color']

                if not stamp_copy.get('perforation') and base_stamp.get('perforation'):
                    stamp_copy['perforation'] = base_stamp['perforation']
            else:
                # Fallback: first base stamp with that number from any issue
                # (less reliable, so perforation is not inherited here).
                for (number, _, _), other_base in base_stamps.items():
                    if number == base_no:
                        if not stamp_copy.get('denomination') and other_base.get('denomination'):
                            stamp_copy['denomination'] = other_base['denomination']
                        if not stamp_copy.get('color') and other_base.get('color'):
                            stamp_copy['color'] = other_base['color']
                        break

        enriched.append(stamp_copy)

    return enriched

def strip_leading_zeros(catalog_no: str) -> str:
    """
    Strip leading zeros from regular stamps, but convert Scott's "0X" notation to "OX" (Official).

    Scott catalog convention:
    - "01", "02", "022" → Official stamps (convert to "O1", "O2", "O22")
    - "001", "0001" → Regular stamps with leading zeros (strip to "1")
    - "C01" → Keep as-is (already has letter prefix)

    Official notation is recognised as exactly ONE zero followed by a number
    of one or two digits that does not itself start with zero. That is what
    separates "022" (Official #22) from "001" (regular #1 with padding).

    Examples:
        "01" → "O1" (Official #1)
        "022" → "O22" (Official #22)
        "001" → "1" (Regular stamp with extra zeros)
        "0021" → "21" (Regular stamp with extra zeros)
        "21" → "21" (no change)
        "C01" → "C01" (no change)

    Args:
        catalog_no: Catalog number; converted with str() and stripped.

    Returns:
        The normalized catalog number; an all-zero input yields "0".
    """
    catalog_no = str(catalog_no).strip()

    # If already has letter prefix, keep as-is
    if re.match(r'^[A-Za-z]', catalog_no):
        return catalog_no

    # Single zero + non-zero-leading number of 1-2 digits = Official stamp.
    if re.match(r'^0[1-9]\d?$', catalog_no):
        return 'O' + catalog_no[1:]  # Replace leading 0 with O

    # Any other leading zeros are padding.
    if catalog_no.startswith('0'):
        return catalog_no.lstrip('0') or '0'

    return catalog_no

def flatten_and_enrich_scott_data(scott_grouped_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn grouped Scott data into one flat, enriched list of stamps.

    Every stamp is copied and tagged with its group's header, gets its
    scott_number normalized, receives a 'year' when the header contains a
    four-digit number, and has split surcharge data repaired. The flat list
    is then passed through enrich_variety_stamps().
    """
    flattened = []

    for group in scott_grouped_data:
        header = group.get('header', '')

        # The year depends only on the header, so work it out once per group.
        header_year = None
        if header:
            year_match = re.search(r'(\d{4})', str(header))
            if year_match:
                header_year = int(year_match.group(1))

        for stamp in group.get('stamps', []):
            record = stamp.copy()
            record['header'] = header
            record['scott_number'] = strip_leading_zeros(record.get('scott_number', ''))
            if header_year is not None:
                record['year'] = header_year

            # Rejoin surcharge denominations that were split into the color field.
            flattened.append(fix_scott_surcharge_data(record))

    return enrich_variety_stamps(flattened)

## 6. Matching Logic

In [None]:
def build_candidate_pool(mena_issue: Dict[str, Any], 
                         all_scott_stamps: List[Dict[str, Any]], 
                         year_tolerance: int = 2) -> List[Dict[str, Any]]:
    """Select the Scott stamps issued close to the Mena issue's primary year.

    A Scott stamp qualifies when its year is within ``year_tolerance`` years
    of the Mena primary year; stamps without a year are excluded and counted.
    When the Mena issue has no primary year, a warning is printed and the
    full input list is returned unchanged. Progress and the complete
    candidate list are printed to stdout.
    """
    primary_year = extract_primary_year(mena_issue['issue_data']['issue_dates'])
    if not primary_year:
        print("WARNING: No primary year found for Mena issue, returning all stamps")
        return all_scott_stamps

    # Keep each candidate together with its year for the reports below.
    dated_candidates = []
    undated_total = 0
    for stamp in all_scott_stamps:
        stamp_year = extract_scott_year(stamp)
        if stamp_year is None:
            undated_total += 1
        elif abs(stamp_year - primary_year) <= year_tolerance:
            dated_candidates.append((stamp, stamp_year))

    candidates = [stamp for stamp, _ in dated_candidates]

    print(f"Found {len(candidates)} Scott candidates for year {primary_year} (±{year_tolerance} years)")
    print(f"Excluded {undated_total} stamps without year information")

    # Number of candidates per year.
    per_year = {}
    for _, stamp_year in dated_candidates:
        per_year[stamp_year] = per_year.get(stamp_year, 0) + 1

    print(f"Year distribution: {dict(sorted(per_year.items()))}")

    print("\nAll candidates overall:")
    for stamp, stamp_year in dated_candidates:
        print(f"  Scott #{stamp.get('scott_number')}: {stamp.get('denomination')} {stamp.get('color')} (year={stamp_year})")

    return candidates

In [None]:
def normalize_catalog_number(catalog_no: str) -> tuple:
    """
    Split a catalog number into (category, sort_number, suffix).

    The category is the upper-cased run of letters at the very start (empty
    for regular issues). Only the LETTER "O" marks an Official stamp; a
    leading digit "0" is formatting and is stripped. A letter suffix after
    the number is folded into the sort number as decimals (a=0.1, b=0.2,
    second letter in hundredths, ...). When no number follows the prefix the
    sort number is 999999.0.

    Examples:
        "21" → ("", 21.0, "")
        "021" → ("", 21.0, "")
        "O21" → ("O", 21.0, "")
        "C164" → ("C", 164.0, "")
    """
    text = str(catalog_no).strip()

    # Leading letters (never digits) form the category.
    prefix = re.match(r'^([A-Za-z]+)', text)
    category = prefix.group(1).upper() if prefix else ""

    # Leading zeros in the numeric part are formatting only.
    rest = text[len(category):].lstrip('0') or '0'

    digits = re.match(r'^(\d+)', rest)
    if digits is None:
        return (category, 999999.0, "")

    sort_number = float(digits.group(1))
    tail = rest[digits.end():]

    letters = re.match(r'^([a-z]+)', tail, re.IGNORECASE)
    if letters is None:
        return (category, sort_number, "")

    suffix = letters.group(1).lower()
    # Each suffix letter contributes one more decimal place.
    for place, letter in enumerate(suffix, start=1):
        sort_number += (ord(letter) - ord('a') + 1) * (0.1 ** place)

    return (category, sort_number, suffix)



def make_scott_unique_key(scott_stamp: Dict[str, Any]) -> str:
    """
    Build a key that tells apart Scott stamps sharing a number across years.

    With a known year the key is "number__year" (e.g., "1__1883", "7__1881");
    without one it is just the scott_number ('UNKNOWN' when that is absent).
    """
    number = scott_stamp.get('scott_number', 'UNKNOWN')
    year = extract_scott_year(scott_stamp)
    return f"{number}__{year}" if year else number

def get_stamp_category(stamp: Dict[str, Any], is_mena: bool = True) -> str:
    """
    Return the category code of a stamp ('' for regular issues).

    Scott stamps: the letter prefix of 'scott_number'.
    Mena stamps: the issue's 'section' text decides; the first keyword found
    in it (checked in the order listed below) gives the code. If none
    matches, the letter prefix of 'catalog_no' is used instead.
    """
    if not is_mena:
        return normalize_catalog_number(stamp.get('scott_number', ''))[0]

    section = ''
    if 'issue_data' in stamp:
        section = stamp.get('issue_data', {}).get('section', '')
    section = section.lower().strip()

    # (keyword, category) pairs; order matters because matching is by substring.
    section_keywords = (
        ('surface mail', ''),  # Regular
        ('airmail', 'C'),
        ('air mail', 'C'),
        ('official', 'O'),
        ('telegraph', 'T'),
        ('telegraphs', 'T'),
        ('postage due', 'J'),
        ('dues', 'J'),
        ('special delivery', 'E'),
        ('registration', 'F'),
        ('guanacaste', 'G'),
    )
    for keyword, category in section_keywords:
        if keyword in section:
            return category

    # No section keyword matched: fall back to the catalog number prefix.
    return normalize_catalog_number(stamp.get('catalog_no', ''))[0]

# Category codes in sorting order: regular issues first, then alphabetically.
_CATEGORY_ORDER = (
    "",    # Regular issues
    "A",   # Airmail (Mena)
    "AR",  # Postal Fiscal (Scott)
    "B",   # Semi-Postal
    "C",   # Airmail (Scott) / Christmas Postal Tax (Mena)
    "CE",  # Air post special delivery
    "CO",  # Air mail official
    "CT",  # Christmas Postal Tax
    "D",   # Dues
    "E",   # Essay (Mena) / Special Delivery (Scott)
    "EN",  # Envelope
    "G",   # Guanacaste
    "J",   # Postage Due (Scott)
    "O",   # Official
    "OA",  # Official Airmail
    "PC",  # Postal Card
    "PR",  # Postal Revenue
    "PS",  # Postal Seal
    "R",   # Revenue
    "RA",  # Postal Tax (Scott)
    "RL",  # Registration Label
    "RS",  # Radiograph Seal
    "SD",  # Special Delivery (Mena)
    "SP",  # Semi-postal / Surcharge Proof
    "SS",  # Souvenir Sheet
    "T",   # Telegraphs
    "TR",  # Telegraph Revenue
    "TS",  # Telegraph Seals
    "W",   # Wrapper
)

# Category priority for sorting: each code maps to its position above.
CATEGORY_PRIORITY = {code: rank for rank, code in enumerate(_CATEGORY_ORDER)}

def get_category_priority(category: str) -> int:
    """Return the sort rank of a category code (case-insensitive); unknown codes rank 999."""
    code = category.upper()
    if code in CATEGORY_PRIORITY:
        return CATEGORY_PRIORITY[code]
    return 999

In [None]:
def _perforation_gauges(raw_perforation: Any) -> set:
    """Return the numeric perforation gauges found in a perforation value.

    Only real numbers are collected (a lone "." as in "Perf." is ignored),
    and a "½" directly after a number adds 0.5 to it.
    """
    gauges = set()
    for number, half in re.findall(r'(\d+(?:\.\d+)?)\s*(½)?', str(raw_perforation)):
        gauges.add(float(number) + (0.5 if half else 0.0))
    return gauges


def calculate_match_score(mena_stamp: Dict[str, Any], 
                         scott_stamp: Dict[str, Any], 
                         mena_issue_context: Dict[str, Any]) -> Dict[str, Any]:
    """Score how well a Mena stamp matches a Scott stamp.

    Signals and their contributions:
      * category     +10 when compatible, -50 otherwise
      * surcharge    -10 when exactly one of the two stamps is a surcharge
      * denomination +35 full match; +20 when both are surcharges and only
                     the new value matches; 0 otherwise
      * color        up to +30 (best similarity over Mena color variants)
      * year         +25 same year, +15 one year apart, +10 two years apart
      * perforation  +10 when any gauge is shared, -5 when both list gauges
                     but none is shared

    Args:
        mena_stamp: Mena stamp record; must contain 'denomination' with
            'value' and 'unit'.
        scott_stamp: Flattened Scott stamp record.
        mena_issue_context: The Mena issue's 'issue_data'; must contain
            'issue_dates'.

    Returns:
        {'total_score': float, 'signals': dict of per-signal points,
         'breakdown': summary string joined with " | "}.
    """
    signals = {}
    total_score = 0.0
    breakdown_parts = []

    # ========== PRE-CHECK: CATEGORY MATCH (10 points, -50 if wrong) ==========
    mena_category = get_stamp_category(
        {'issue_data': mena_issue_context, 'catalog_no': mena_stamp.get('catalog_no', '')},
        is_mena=True
    )
    scott_category = get_stamp_category(scott_stamp, is_mena=False)

    category_compatible = mena_category == scott_category
    if not category_compatible:
        # (Mena categories, Scott categories) pairs that count as equivalent.
        equivalences = [
            ({'', 'C'}, {'', 'C'}),
            ({'O'}, {'O', 'CO'}),
            ({'E'}, {'E', 'CE'}),
            ({'J'}, {'J'}),
            ({'G'}, {'G'}),
        ]
        category_compatible = any(
            mena_category in mena_set and scott_category in scott_set
            for mena_set, scott_set in equivalences
        )

    if not category_compatible:
        signals['category'] = -50
        total_score -= 50
        breakdown_parts.append(f"Cat: ✗ (M:{mena_category or 'REG'} vs S:{scott_category or 'REG'})")
    else:
        signals['category'] = 10
        total_score += 10
        breakdown_parts.append("Cat: ✓")

    # ========== SIGNAL 1: DENOMINATION (35 points) ==========
    mena_denom = normalize_denomination(
        mena_stamp['denomination']['value'],
        mena_stamp['denomination']['unit']
    )
    scott_denom = parse_denomination_string(scott_stamp.get('denomination', ''))

    # Both flags are forced to real booleans: a missing 'overprint' must
    # compare equal to "no surcharge" instead of None != False.
    overprint = mena_stamp.get('overprint') or {}
    mena_has_surcharge = bool(overprint.get('present')) and overprint.get('type') == 'surcharge'
    scott_has_surcharge = 'surcharge' in scott_denom

    # Surcharge mismatch penalty (ONLY if one has and other doesn't)
    if mena_has_surcharge != scott_has_surcharge:
        signals['surcharge_mismatch'] = -10
        total_score -= 10
        breakdown_parts.append("Surcharge: ✗")

    new_match = (mena_denom['value'] == scott_denom['value'] and
                 mena_denom['unit'] == scott_denom['unit'])

    if mena_has_surcharge and scott_has_surcharge:
        # Both are surcharges - the original ("on") value must agree as well.
        on_denom = overprint.get('on_denomination') or {}
        orig_match = False
        if on_denom.get('unit') is not None:
            mena_orig = normalize_denomination(on_denom.get('value'), on_denom['unit'])
            scott_orig = scott_denom['surcharge']
            orig_match = (mena_orig['value'] == scott_orig['on_value'] and
                          mena_orig['unit'] == scott_orig['on_unit'])

        if new_match and orig_match:
            signals['denomination'] = 35
            total_score += 35
            breakdown_parts.append("Denom: ✓")
        elif new_match:
            signals['denomination'] = 20
            total_score += 20
            breakdown_parts.append("Denom: ⚠️")
        else:
            signals['denomination'] = 0
            breakdown_parts.append("Denom: ✗")
    elif new_match:
        signals['denomination'] = 35
        total_score += 35
        breakdown_parts.append("Denom: ✓")
    else:
        signals['denomination'] = 0
        breakdown_parts.append("Denom: ✗")

    # ========== SIGNAL 2: COLOR (30 points) ==========
    mena_color_raw = mena_stamp.get('color')
    scott_color_raw = scott_stamp.get('color')
    if mena_color_raw and scott_color_raw:
        # Remove overprint suffixes such as "(R)" from the Scott color.
        scott_color = clean_scott_color(scott_color_raw)

        # Compound/variant Mena colors: score each variant, keep the best.
        if '/' in mena_color_raw:
            mena_colors = [c.strip() for c in mena_color_raw.split('/')]
        elif ' & ' in mena_color_raw:
            mena_colors = [c.strip() for c in mena_color_raw.split('&')]
        else:
            mena_colors = [mena_color_raw]

        best_similarity = max(
            (calculate_color_family_similarity(variant, scott_color) for variant in mena_colors),
            default=0.0,
        )

        color_score = best_similarity * 30
        signals['color'] = color_score
        total_score += color_score
        breakdown_parts.append(f"Color: {int(best_similarity*100)}%")
    else:
        signals['color'] = 0

    # ========== SIGNAL 3: YEAR (25 points) ==========
    mena_year = extract_primary_year(mena_issue_context['issue_dates'])
    scott_year = extract_scott_year(scott_stamp)
    if mena_year and scott_year:
        year_diff = abs(mena_year - scott_year)
        if year_diff == 0:
            signals['year'] = 25
            total_score += 25
            breakdown_parts.append("Year: ✓")
        elif year_diff == 1:
            signals['year'] = 15
            total_score += 15
            breakdown_parts.append("Year: ~1")
        elif year_diff == 2:
            signals['year'] = 10
            total_score += 10
            breakdown_parts.append("Year: ~2")

    # ========== SIGNAL 4: PERFORATION (10 points) ==========
    # Gauges are compared as numbers, so "12" equals "12.0" and the period
    # in "Perf." can no longer produce a false match.
    mena_gauges = _perforation_gauges(mena_stamp.get('perforation', ''))
    scott_gauges = _perforation_gauges(scott_stamp.get('perforation', ''))

    if mena_gauges and scott_gauges:
        if mena_gauges & scott_gauges:
            signals['perforation'] = 10
            total_score += 10
        else:
            signals['perforation'] = -5
            total_score -= 5
            breakdown_parts.append("Perf: ✗")

    return {
        'total_score': total_score,
        'signals': signals,
        'breakdown': " | ".join(breakdown_parts)
    }

In [None]:
def score_all_candidates(mena_issue: Dict[str, Any], 
                        scott_candidate_pool: List[Dict[str, Any]], 
                        min_threshold: float = 30.0) -> List[Dict[str, Any]]:
    """Score every Mena stamp of an issue against every Scott candidate.

    Returns one row per Mena stamp that has at least one candidate scoring
    ``min_threshold`` or more. Each row holds the stamp's 'mena_catalog_no'
    and its qualifying 'candidates' (number, unique key, year, score,
    signals and breakdown), in pool order.
    """
    matrix = []

    for mena_stamp in mena_issue['stamps']:
        catalog_no = mena_stamp['catalog_no']
        qualifying = []

        for scott_stamp in scott_candidate_pool:
            result = calculate_match_score(mena_stamp, scott_stamp, mena_issue['issue_data'])
            if result['total_score'] >= min_threshold:
                qualifying.append({
                    'scott_number': scott_stamp.get('scott_number', 'UNKNOWN'),
                    'scott_unique_key': make_scott_unique_key(scott_stamp),
                    'scott_year': extract_scott_year(scott_stamp),
                    'score': result['total_score'],
                    'signals': result['signals'],
                    'breakdown': result['breakdown'],
                })

        # Stamps without any qualifying candidate are left out entirely.
        if qualifying:
            matrix.append({'mena_catalog_no': catalog_no, 'candidates': qualifying})

    return matrix

In [None]:
def find_optimal_assignment(scoring_matrix: List[Dict[str, Any]]) -> List[MatchResult]:
    """Pick the best one-to-one pairing of Mena stamps and Scott stamps.

    Scott stamps are identified by their unique key (e.g. "7__1883") rather
    than the bare number. A square cost matrix of negated scores is solved
    with scipy's linear_sum_assignment; pairs that were never scored use a
    cost of 1000.0. Pairings that fall on padding, were never scored, or
    score below 30 are dropped. Results are sorted by the numeric part of
    the Mena catalog number.
    """
    from scipy.optimize import linear_sum_assignment
    import numpy as np

    mena_numbers = [row['mena_catalog_no'] for row in scoring_matrix]
    scott_keys = sorted({
        cand['scott_unique_key']
        for row in scoring_matrix
        for cand in row['candidates']
    })
    column_of = {scott_key: column for column, scott_key in enumerate(scott_keys)}

    n_mena = len(mena_numbers)
    n_scott = len(scott_keys)
    size = max(n_mena, n_scott)

    # Square matrix; unscored pairs keep the large default cost.
    costs = np.full((size, size), 1000.0)
    scored_pairs = {}
    for row_index, row in enumerate(scoring_matrix):
        for cand in row['candidates']:
            scott_key = cand['scott_unique_key']
            if scott_key in column_of:
                costs[row_index, column_of[scott_key]] = -cand['score']
            scored_pairs[(row['mena_catalog_no'], scott_key)] = cand

    chosen_rows, chosen_columns = linear_sum_assignment(costs)

    matches = []
    for row_index, column in zip(chosen_rows, chosen_columns):
        # Skip the padding rows/columns added to make the matrix square.
        if row_index >= n_mena or column >= n_scott:
            continue

        mena_no = mena_numbers[row_index]
        cand = scored_pairs.get((mena_no, scott_keys[column]))
        if cand is None:
            continue

        score = cand['score']
        if score < 30:
            continue

        if score >= 70:
            confidence = "HIGH"
        elif score >= 50:
            confidence = "MEDIUM"
        else:
            confidence = "LOW"

        matches.append(MatchResult(
            mena_catalog_no=mena_no,
            # Show the year next to the number for clarity.
            scott_number=f"{cand['scott_number']} ({cand.get('scott_year', '?')})",
            confidence=confidence,
            score=score,
            signals=cand['signals'],
            breakdown=cand['breakdown'],
            boost_reasons=[],
            requires_review=score < 70,
        ))

    return sorted(matches, key=lambda match: normalize_catalog_number(match.mena_catalog_no)[1])

def extract_numeric_prefix(catalog_no: str) -> float:
    """
    Convert a catalog number into a float usable as a sort key.

    A leading run of uppercase letters is dropped, the digits that follow
    form the integer part, and the first letter found *after* those digits
    adds a fractional offset (a=0.1, b=0.2, ...).

    Examples:
        "17" → 17.0
        "17a" → 17.1
        "21" → 21.0
        "C164" → 164.0 (the letter prefix is not treated as a suffix)
        "SSA497a" → 497.1

    Args:
        catalog_no: Catalog number such as "17", "17a" or "C164".

    Returns:
        The numeric sort key, or 999999.0 when no digits follow the optional
        uppercase prefix (such entries therefore sort last).
    """
    # Remove letter prefixes (like "C" in "C164")
    no_prefix = re.sub(r'^[A-Z]+', '', catalog_no)

    # Extract the numeric part
    match = re.match(r'(\d+)', no_prefix)
    if not match:
        # Fallback for non-standard formats
        return 999999.0

    base_num = float(match.group(1))

    # Look for the variety letter only in the text that follows the digits.
    # Searching the whole string would pick up the prefix letter instead
    # (e.g. the "C" of "C164") and skew the key.
    remainder = no_prefix[match.end():].lower()
    suffix_match = re.search(r'[a-z]', remainder)
    if suffix_match:
        # NOTE: offsets reach 1.0 from the letter "j" onwards, so very late
        # suffixes overlap the next base number.
        base_num += (ord(suffix_match.group(0)) - ord('a') + 1) * 0.1

    return base_num

## 7. Main Matching Function

In [None]:
def match_mena_to_scott(mena_issue: Dict[str, Any],
                       all_scott_stamps: List[Dict[str, Any]],
                       year_tolerance: int = 2,
                       min_score_threshold: float = 30.0) -> Dict[str, Any]:
    """
    Match every stamp of one Mena issue against the Scott catalog.

    The work is delegated to three helpers, called in this order:
    build_candidate_pool -> score_all_candidates -> find_optimal_assignment.

    Args:
        mena_issue: Parsed Mena issue; 'stamps' and
            'issue_data' ('issue_id', 'title') are read here.
        all_scott_stamps: Scott stamps handed to build_candidate_pool.
        year_tolerance: Forwarded to build_candidate_pool.
        min_score_threshold: Forwarded to score_all_candidates.

    Returns:
        Dict with the keys 'issue_match', 'assignments', 'statistics' and
        'scoring_matrix'.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("MENA TO SCOTT CATALOG MATCHING")
    print(banner)

    # Pipeline: candidate pool -> per-candidate scores -> one-to-one assignment
    scott_candidates = build_candidate_pool(mena_issue, all_scott_stamps, year_tolerance)
    scoring_matrix = score_all_candidates(mena_issue, scott_candidates, min_score_threshold)
    assignments = find_optimal_assignment(scoring_matrix)

    # Tally assignments per confidence label
    mena_stamps = mena_issue['stamps']
    matched_count = len(assignments)
    per_level = {"HIGH": 0, "MEDIUM": 0, "LOW": 0}
    for item in assignments:
        if item.confidence in per_level:
            per_level[item.confidence] += 1

    # Guard against an empty issue to avoid dividing by zero
    if mena_stamps:
        success_rate = round(matched_count / len(mena_stamps) * 100, 1)
    else:
        success_rate = 0

    statistics = {
        'total_mena_stamps': len(mena_stamps),
        'total_assignments': matched_count,
        'high_confidence': per_level["HIGH"],
        'medium_confidence': per_level["MEDIUM"],
        'low_confidence': per_level["LOW"],
        'success_rate': success_rate,
    }

    issue_data = mena_issue['issue_data']
    issue_match = {
        'mena_issue_id': issue_data['issue_id'],
        'mena_title': issue_data['title'],
        'candidate_pool_size': len(scott_candidates),
    }

    # Plain-dict view of each MatchResult, with numbers rounded to 1 decimal
    serialised = []
    for item in assignments:
        serialised.append({
            'mena_catalog_no': item.mena_catalog_no,
            'scott_number': item.scott_number,
            'confidence': item.confidence,
            'score': round(item.score, 1),
            'signals': {name: round(value, 1) for name, value in item.signals.items()},
            'breakdown': item.breakdown,
            'requires_review': item.requires_review,
        })

    return {
        'issue_match': issue_match,
        'assignments': serialised,
        'statistics': statistics,
        'scoring_matrix': scoring_matrix,
    }

## 8. Results Printing

In [None]:
def print_matching_results(result: Dict[str, Any]) -> None:
    """
    Print a human-readable report of a matching result.

    Reads result['assignments'] (one entry per match) and
    result['statistics'] and writes the report to stdout.
    """
    rule = "=" * 80

    print(f"\n{rule}")
    print("MATCHING RESULTS")
    print(rule)

    # One three-line entry per assignment
    for entry in result['assignments']:
        mena_no, scott_no = entry['mena_catalog_no'], entry['scott_number']
        print(f"\n✓ Mena #{mena_no} → Scott #{scott_no}")
        level, points = entry['confidence'], entry['score']
        print(f"  Confidence: {level} (Score: {points}/100)")
        print("  " + f"{entry['breakdown']}")

    # Totals footer
    print(f"\n{rule}")
    stats = result['statistics']
    total, matched, rate = (
        stats['total_mena_stamps'],
        stats['total_assignments'],
        stats['success_rate'],
    )
    print(f"Total: {total} | Matched: {matched} ({rate}%)")
    high, medium, low = (
        stats['high_confidence'],
        stats['medium_confidence'],
        stats['low_confidence'],
    )
    print(f"High: {high} | Medium: {medium} | Low: {low}")
    print(f"{rule}\n")

## 9. Load Your Data

**Replace these paths with your actual file paths!**

In [None]:
# Location of the parsed Mena catalogue (JSON file)
PATH = Path("results/parsed_catalogues/mena_parse_results_ALL.json")

# Load: read the file as UTF-8 and decode the JSON payload
with PATH.open("r", encoding="utf-8") as f:
    mena_parsed_catalog = json.load(f)


In [81]:
# Pick a single issue by position; this is the issue fed to the matcher below
mena_issue = mena_parsed_catalog[435]
print(f"Loaded Mena issue: {mena_issue['issue_data']['title']}")
print(f"Number of stamps: {len(mena_issue['stamps'])}")
# Dump the whole record for manual inspection
print(mena_issue)

Loaded Mena issue: Overprint III Philatelic Exhibition issue
Number of stamps: 2
{'issue_data': {'issue_id': 'CR-1968-III-PHILATELIC-EXHIBITION-OVERPRINT', 'section': 'Surface Mail', 'title': 'Overprint III Philatelic Exhibition issue', 'country': 'Costa Rica', 'issue_dates': {'announced': '1968-08-01', 'placed_on_sale': None, 'probable_first_circulation': None, 'second_plate_sale': None, 'demonetized': None}, 'legal_basis': [{'type': 'resolution', 'id': 'Central Bank Resolution #15', 'date': '1968-07-18', 'ids': [], 'officials': []}], 'currency_context': {'original': 'C', 'decimal_adoption': None, 'revaluation_date': None, 'revaluation_map': {}}, 'printing': {'printer': 'Imprenta Nacional', 'process': ['typographed', 'overprint'], 'format': {'panes': None}, 'plates': {}}, 'perforation': ''}, 'production_orders': {'printings': [], 'remainders': {'date': None, 'note': '', 'quantities': []}}, 'stamps': [{'catalog_no': 'SSA497', 'issue_id': 'CR-1968-III-PHILATELIC-EXHIBITION-OVERPRINT', '

In [69]:
# Load Scott catalog (grouped structure)
# NOTE: this rebinds PATH, which the Mena loading cell also assigns.
PATH = Path("results/parsed_catalogues/scott_parse_results_ALL.json")

# Load: read the file as UTF-8 and decode the JSON payload
with PATH.open("r", encoding="utf-8") as f:
    scott_grouped = json.load(f)

print(f"Loaded Scott catalog: {len(scott_grouped)} issue groups")

Loaded Scott catalog: 1086 issue groups


In [70]:
# CRITICAL STEP: Flatten and enrich Scott data
# The result, all_scott_stamps, is the list passed to the matcher later on.
all_scott_stamps = flatten_and_enrich_scott_data(scott_grouped)

print(f"Preprocessed: {len(all_scott_stamps)} total stamps")
print(f"\nExample enriched variety stamp (Scott #1a):")
# Spot check: only the first 10 stamps are scanned for Scott #1a (1863);
# nothing is printed if it is not among them.
for stamp in all_scott_stamps[:10]:
    if stamp.get('scott_number') == '1a' and stamp.get('year') == 1863:
        print(f"  denomination: {stamp.get('denomination')}")
        print(f"  color: {stamp.get('color')}")
        print(f"  variety_of: {stamp.get('variety_of')}")
        break

Preprocessed: 2559 total stamps

Example enriched variety stamp (Scott #1a):
  denomination: ½r
  color: light blue
  variety_of: 1


## 10. Run Matching

In [82]:
# Run the matching algorithm
# NOTE: min_score_threshold is set to 60.0 here, above the function's
# default of 30.0.
result = match_mena_to_scott(
    mena_issue=mena_issue,
    all_scott_stamps=all_scott_stamps,
    year_tolerance=2,
    min_score_threshold=60.0
)

# Print results
print_matching_results(result)


MENA TO SCOTT CATALOG MATCHING
Found 108 Scott candidates for year 1968 (±2 years)
Excluded 366 stamps without year information
Year distribution: {1966: 14, 1967: 48, 1968: 9, 1969: 18, 1970: 19}

All candidates overall:
  Scott #267: 15c multicolored (year=1969)
  Scott #268: 35c multicolored (year=1969)
  Scott #269: 50c gray & multi (year=1969)
  Scott #270: 55c buff & multi (year=1969)
  Scott #271: 65c multicolored (year=1969)
  Scott #272: 1col pink & multi (year=1969)
  Scott #273: 2col multicolored (year=1969)
  Scott #C421: 5c black & red (year=1966)
  Scott #C422: 10c bister & red (year=1966)
  Scott #C423: 15c blk, red brn & red (year=1966)
  Scott #C424: 35c black & yel (year=1966)
  Scott #C425: 50c dk blue & red (year=1966)
  Scott #C426: 15c on 30c surcharge color unknown (year=1966)
  Scott #C427: 15c on 45c surcharge color unknown (year=1966)
  Scott #C428: 35c on 75c surcharge color unknown (year=1966)
  Scott #C429: 35c on 55c surcharge color unknown (year=1966)
  

In [87]:
# Build the candidate pool (year tolerance 2) and render every candidate as
# a one-line text description.
scott_raw_candidates = build_candidate_pool(mena_issue, all_scott_stamps, 2)
scott_str_candidates = [
    f"  Scott #{cand.get('scott_number')}: {cand.get('denomination')} {cand.get('color')} (year={extract_scott_year(cand)})"
    for cand in scott_raw_candidates
]

Found 108 Scott candidates for year 1968 (±2 years)
Excluded 366 stamps without year information
Year distribution: {1966: 14, 1967: 48, 1968: 9, 1969: 18, 1970: 19}

All candidates overall:
  Scott #267: 15c multicolored (year=1969)
  Scott #268: 35c multicolored (year=1969)
  Scott #269: 50c gray & multi (year=1969)
  Scott #270: 55c buff & multi (year=1969)
  Scott #271: 65c multicolored (year=1969)
  Scott #272: 1col pink & multi (year=1969)
  Scott #273: 2col multicolored (year=1969)
  Scott #C421: 5c black & red (year=1966)
  Scott #C422: 10c bister & red (year=1966)
  Scott #C423: 15c blk, red brn & red (year=1966)
  Scott #C424: 35c black & yel (year=1966)
  Scott #C425: 50c dk blue & red (year=1966)
  Scott #C426: 15c on 30c surcharge color unknown (year=1966)
  Scott #C427: 15c on 45c surcharge color unknown (year=1966)
  Scott #C428: 35c on 75c surcharge color unknown (year=1966)
  Scott #C429: 35c on 55c surcharge color unknown (year=1966)
  Scott #C430: 50c on 85c surcharg

## 11. Save Results

In [None]:
# Save to JSON
output_file = "matching_results.json"
# Write UTF-8 explicitly (the input files are read as UTF-8 too) instead of
# relying on the platform default encoding; ensure_ascii=False keeps any
# non-ASCII text in the results readable rather than \u-escaped.
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"✓ Results saved to: {output_file}")

## 12. Detailed Results Table

In [None]:
import pandas as pd

# Create DataFrame: one row per assignment, with the individual signal
# scores spread into their own columns (missing signals default to 0)
df = pd.DataFrame([
    {
        'Mena #': a['mena_catalog_no'],
        'Scott #': a['scott_number'],
        'Score': a['score'],
        'Confidence': a['confidence'],
        'Denom': a['signals'].get('denomination', 0),
        'Color': a['signals'].get('color', 0),
        'Year': a['signals'].get('year', 0),
        'Perf': a['signals'].get('perforation', 0),
        # Warning marker for rows flagged for manual review
        'Review': '⚠️' if a['requires_review'] else '✓'
    }
    for a in result['assignments']
])

# Print the table without the DataFrame index, framed by rule lines
print("\n" + "="*80)
print("DETAILED MATCHING TABLE")
print("="*80)
print(df.to_string(index=False))
print("\nLegend: Denom=Denomination, Perf=Perforation")
print("="*80)

## Summary

### Key Fixes Applied:

1. ✅ **Scott Data Flattening** - Converts nested structure to flat list
2. ✅ **Variety Enrichment** - Inherits data from base stamps to varieties
3. ✅ **Denomination Normalization** - Handles "reales" → "real", "p" → "peso"
4. ✅ **Color Family Matching** - Recognizes "yellow" ≈ "orange" (85%)
5. ✅ **Year Extraction** - Pulls year from multiple date formats

### Expected Results:
- **Match Rate**: >90%
- **High Confidence**: >70%
- **Zero False Positives**

### Confidence Levels:
- **HIGH** (score ≥ 70): Very reliable, approve immediately
- **MEDIUM** (50 ≤ score < 70): Likely correct, review recommended
- **LOW** (30 ≤ score < 50): Uncertain, requires manual verification

In [None]:
def find_catalog_gaps_complete(all_scott_stamps: List[Dict[str, Any]], 
                               analyze_all: bool = False,
                               show_details: bool = False) -> Dict[str, Any]:
    """
    Find gaps in Scott catalog numbering with proper category handling.

    Stamps are grouped by the category prefix returned by
    normalize_catalog_number; only base numbers (empty suffix) are tracked.
    Within each category the numbers are sorted and every jump larger than 1
    between neighbours is reported as a gap. A report is printed to stdout.

    Only the categories listed in ``category_order`` below are reported;
    stamps with any other prefix are grouped but never printed.

    Costa Rica Scott Catalog Ranges (approximate):
    - Regular: 1-733
    - Airmail (C): C1-C940
    - Official (O): O1-O75
    - Guanacaste (G): G1-G70
    - Postage Due (J): J1-J50
    - Air Post Official (CO): CO1-CO30
    - Semi-Postal (B): B1-B10
    - Special Delivery (E): E1-E10
    - Postal Tax (RA): RA1-RA50

    Args:
        all_scott_stamps: Scott stamp dicts; 'scott_number', 'denomination',
            'color' and 'header' are read here.
        analyze_all: If False, the less common categories (F, Q, QE) are
            skipped.
        show_details: Show color and header details for gaps

    Returns:
        Dict with 'total_gaps', 'total_missing' and 'categories_with_gaps'.
    """
    from collections import defaultdict
    
    # Complete Scott category definitions with Costa Rica context
    SCOTT_CATEGORIES = {
        "": {"name": "Regular Issues", "mena_equiv": "(none)", "analyze": True, "typical_max": 733},
        "C": {"name": "Air Post (Airmail)", "mena_equiv": "A", "analyze": True, "typical_max": 940},
        "O": {"name": "Official", "mena_equiv": "O", "analyze": True, "typical_max": 75},
        "CO": {"name": "Air Post Official", "mena_equiv": "OA", "analyze": True, "typical_max": 30},
        "CE": {"name": "Air Post Special Delivery", "mena_equiv": "SD+A", "analyze": True, "typical_max": 10},
        "E": {"name": "Special Delivery", "mena_equiv": "SD", "analyze": True, "typical_max": 10},
        "J": {"name": "Postage Due", "mena_equiv": "D", "analyze": True, "typical_max": 50},
        "B": {"name": "Semi-Postal", "mena_equiv": "SP", "analyze": True, "typical_max": 10},
        "RA": {"name": "Postal Tax", "mena_equiv": "CT", "analyze": True, "typical_max": 50},
        "AR": {"name": "Postal Fiscal", "mena_equiv": "R", "analyze": True, "typical_max": 20},
        "G": {"name": "Guanacaste", "mena_equiv": "G", "analyze": True, "typical_max": 70},
        # Less common categories
        "F": {"name": "Registration", "mena_equiv": "RL", "analyze": analyze_all, "typical_max": 10},
        "Q": {"name": "Parcel Post", "mena_equiv": "-", "analyze": analyze_all, "typical_max": 10},
        "QE": {"name": "Parcel Post Special Delivery", "mena_equiv": "-", "analyze": analyze_all, "typical_max": 5},
    }
    
    # Group stamps by category
    by_category = defaultdict(list)
    
    for stamp in all_scott_stamps:
        scott_no = stamp.get('scott_number', '')
        year = extract_scott_year(stamp)
        
        # Parse catalog number
        cat, num, suffix = normalize_catalog_number(scott_no)
        
        # Only track base numbers (ignore varieties)
        if suffix == "":
            by_category[cat].append({
                'scott_number': scott_no,
                'numeric': int(num),
                'year': year,
                'denomination': stamp.get('denomination', ''),
                'color': stamp.get('color', ''),
                'header': stamp.get('header', '')
            })
    
    # Find gaps
    print("\n" + "="*80)
    print("SCOTT CATALOG GAP ANALYSIS - COSTA RICA")
    print("="*80)
    print("\nAnalyzing matchable categories (Regular, Airmail, Official, etc.)")
    print("Note: Each category has independent numbering:")
    print("      Regular: 1-733, Airmail: C1-C940, Official: O1-O75, etc.")
    print("="*80)
    
    total_gaps = 0
    total_missing = 0
    categories_with_gaps = []
    
    # Sort categories by common usage
    category_order = ["", "C", "O", "CO", "CE", "E", "J", "B", "RA", "AR", "G", "F", "Q", "QE"]
    
    for category in category_order:
        if category not in by_category:
            continue
        
        # Check if we should analyze this category
        cat_info = SCOTT_CATEGORIES.get(category, {"name": f"{category} Issues", "mena_equiv": "-", "analyze": True, "typical_max": 100})
        if not cat_info["analyze"]:
            continue
        
        stamps = sorted(by_category[category], key=lambda x: x['numeric'])
        
        # A gap needs two neighbours to compare
        if len(stamps) < 2:
            continue
        
        # Get range info
        min_num = stamps[0]['numeric']
        max_num = stamps[-1]['numeric']
        prefix = category if category else ""
        expected_total = max_num - min_num + 1
        
        # Find gaps: compare each stamp with its successor in sorted order
        gaps = []
        for before, after in zip(stamps, stamps[1:]):
            current_num = before['numeric']
            next_num = after['numeric']
            
            if next_num - current_num > 1:
                gap_start = current_num + 1
                gap_end = next_num - 1
                gaps.append({
                    'before': before,
                    'after': after,
                    'gap_start': gap_start,
                    'gap_end': gap_end,
                    'gap_size': gap_end - gap_start + 1
                })
        
        # Print header for category
        cat_display = cat_info["name"]
        mena_equiv = cat_info["mena_equiv"]
        typical_max = cat_info["typical_max"]
        
        print(f"\n{'='*80}")
        print(f"{cat_display} (Scott: {prefix if prefix else '(none)'} | Mena: {mena_equiv})")
        print(f"{'='*80}")
        print(f"Range: {prefix}{min_num} to {prefix}{max_num} (typical max: ~{prefix}{typical_max})")
        print(f"Found: {len(stamps)} stamps | Expected if consecutive: {expected_total} stamps")
        
        # Flag a maximum that is more than 1.5x the typical maximum
        if max_num > typical_max * 1.5:
            print(f"⚠️  NOTE: Maximum number ({prefix}{max_num}) exceeds typical range (~{prefix}{typical_max})")
        
        if gaps:
            missing_count = sum(g['gap_size'] for g in gaps)
            total_missing += missing_count
            categories_with_gaps.append(cat_display)
            
            print(f"Status: ⚠️  {len(gaps)} gap(s) found ({missing_count} missing stamps)")
            print("-" * 80)
            
            # enumerate() supplies the 1-based ordinal directly; the previous
            # version recomputed it with nested gaps.index() scans per gap.
            for gap_no, gap in enumerate(gaps, start=1):
                total_gaps += 1
                
                # Format gap range
                if gap['gap_size'] == 1:
                    gap_display = f"{prefix}{gap['gap_start']}"
                else:
                    gap_display = f"{prefix}{gap['gap_start']}-{prefix}{gap['gap_end']}"
                
                # Determine severity
                severity = ""
                if gap['gap_size'] > 50:
                    severity = "🚨 CRITICAL - Very large gap! Possible parser error!"
                elif gap['gap_size'] > 20:
                    severity = "⚠️  WARNING - Large gap"
                elif gap['gap_size'] > 10:
                    severity = "⚠️  Moderate gap"
                elif gap['gap_start'] <= 3:
                    severity = "⚠️  Low numbers missing - verify correct"
                
                print(f"\n  Gap #{gap_no}: {gap_display}")
                print(f"    Missing: {gap['gap_size']} stamp{'s' if gap['gap_size'] > 1 else ''} {severity}")
                print(f"    Before: #{gap['before']['scott_number']} = {gap['before']['denomination']} ({gap['before']['year']})")
                print(f"    After:  #{gap['after']['scott_number']} = {gap['after']['denomination']} ({gap['after']['year']})")
                
                # Year analysis (skipped when either year is missing/falsy)
                if gap['after']['year'] and gap['before']['year']:
                    year_diff = abs(gap['after']['year'] - gap['before']['year'])
                    if year_diff > 5:
                        print(f"    📅 {year_diff}-year gap between issues")
                    elif year_diff == 0:
                        print(f"    📅 Same year ({gap['before']['year']}) - likely intentional gap or reserved numbers")
                
                # Show details if requested
                if show_details:
                    print("    Details:")
                    print(f"      Before: {gap['before']['color']} | {gap['before']['header']}")
                    print(f"      After:  {gap['after']['color']} | {gap['after']['header']}")
        else:
            print("Status: ✓ Complete (no gaps - consecutive numbering)")
    
    # Summary
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Total gaps found: {total_gaps}")
    print(f"Total missing stamps: {total_missing}")
    
    if categories_with_gaps:
        print("\nCategories with gaps:")
        for cat in categories_with_gaps:
            print(f"  • {cat}")
        print("\nNote: Some gaps are normal (reserved numbers, stamps never issued)")
        print("      Gaps >50 stamps likely indicate parser errors")
    else:
        print("\n✓ All categories have consecutive numbering!")
    
    print("="*80)
    
    return {
        'total_gaps': total_gaps,
        'total_missing': total_missing,
        'categories_with_gaps': categories_with_gaps
    }

In [None]:
# Run the gap analysis with the default options
# (analyze_all=False, show_details=False); the report is printed to stdout.
find_catalog_gaps_complete(all_scott_stamps)

## Approach 2: LLM Few-Shot Matching

In [118]:
import json
from landingai_ade import LandingAIADE
# Load environment variables 
from dotenv import load_dotenv
load_dotenv()
import re
import os
import json, traceback

In [119]:
"""
Mena–Scott Matcher (Costa Rica) — LLM-Driven, Schema-Consistent
Author: (Your Name)

- Input:
    * mena_issue: a parsed Mena JSON (must include issue_data.issue_id and stamps[])
    * scott_candidates: a list of candidate strings (raw or structured text lines)

- Output (ALWAYS this schema):
    {
      "issue_id": "<Mena issue id>",
      "equivalences": [
        { "mena": "<Mena catalog_no>", "scott": "<Scott normalized>", "confidence": "low|medium|high" }
      ]
    }

- Notes:
  * Uses LangChain + an LLM to do the reasoning/matching (no regex scoring).
  * Enforces mapping conventions:
      - Mena prefixes to Scott families:
          A  -> C        (airmail)
          OA -> CO       (official airmail)
          O  -> O        (official)   **Scott leading '0' also means Official**
          D  -> J        (postage due)
          SD -> E        (special delivery)
          SP -> B        (semi-postal)
          CT -> RA       (postal tax / Christmas)
          G  -> G        (Guanacaste)
      - Regular issue: no prefix ↔ no prefix
      - It's OK to **strip leading letter(s)** to compare base numbers,
        but output must keep Scott's original prefix ("O", "0", "RA", etc.).
  * Temperature defaults to 1 (see MenaScottMatcher.__init__); the model reply is parsed as JSON via JsonOutputParser.
"""

import json
import traceback
from typing import Dict, Any, List, Optional

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.callbacks import get_openai_callback


# System prompt for the matcher chain. It is passed to ChatPromptTemplate as
# the "system" message, so the schema's literal braces are written doubled
# ({{ }}) — presumably so the template engine does not read them as
# placeholders; confirm against the prompt-template docs before editing.
MATCHER_SYSTEM_PROMPT = """
You are MenaScottMatcher — a **strict, JSON-only** catalog equivalence matcher for Costa Rica stamps.
You receive:
  1) One Mena issue JSON (with issue_data, stamps, etc.)
  2) A list of Scott candidates as free-form strings

Your task:
- Produce ONE JSON only with EXACT schema:
  {{
    "issue_id": "<Mena issue id>",
    "equivalences": [
      {{ "mena": "<Mena catalog_no>", "scott": "<Scott number with its original prefix>", "confidence": "low|medium|high" }}
    ]
  }}

Rules:
- If no matches: return "equivalences": []
- Confidence must be one of: "low", "medium", "high".
- NEVER add extra keys.
- NEVER return commentary; ONLY the JSON object.

Family/prefix mapping rules (deterministic):
- Mena "A"  -> Scott "C"   (airmail)
- Mena "OA" -> Scott "CO"  (official airmail)
- Mena "O"  -> Scott "O"   (official). Scott leading "0" (zero) also means Official.
- Mena "D"  -> Scott "J"   (postage due)
- Mena "SD" -> Scott "E"   (special delivery)
- Mena "SP" -> Scott "B"   (semi-postal)
- Mena "CT" -> Scott "RA"  (postal tax / Christmas)
- Mena "G"  -> Scott "G"   (Guanacaste)
- Regular issue: no prefix ↔ no prefix.

Handling:
- You may **strip** leading letter prefixes (Mena and Scott) internally to compare base numbers
  but you MUST output Scott numbers with their original prefix (e.g., "O48", "RA37", "59").
- For Scott official numbers, treat "0" and "O" as equivalent **for matching only**. Output the original as shown in candidates.
- Prefer matches where denomination unit/value, year, and color are compatible.
- Souvenir sheets (SS*, SSA*) generally don't map to Scott core numbers unless a candidate explicitly corresponds.
- If data is insufficient/ambiguous, omit the equivalence (don't guess).

Output only the JSON.
"""

# --------------------------
# Few-shot examples
# --------------------------
# Each FS_INPUT_n has the same two keys as the live request payload
# ("mena_issue", "scott_candidates"); each FS_OUTPUT_n is the exact reply
# expected for it ("issue_id", "equivalences").

# Example 1: Regular + Official mapping, same-year, clear color/denomination cues
FS_INPUT_1 = {
    "mena_issue": {
        "issue_data": {"issue_id": "CR-1907-ISSUE", "issue_dates": {"placed_on_sale": "1907-09-29"}},
        "stamps": [
            {"catalog_no": "59", "status": "regular",
             "denomination": {"value": 10, "unit": "c"}, "color": "blue & black"},
            {"catalog_no": "O48", "status": "official",
             "denomination": {"value": 1, "unit": "c"}, "color": "red brown & indigo"}
        ]
    },
    "scott_candidates": [
        "Scott #59: 10c blue & black (year=1907)",
        "Scott #O48: 1c red brn & ind (year=1907)"
    ]
}
FS_OUTPUT_1 = {
    "issue_id": "CR-1907-ISSUE",
    "equivalences": [
        {"mena": "59", "scott": "59", "confidence": "high"},
        {"mena": "O48", "scott": "O48", "confidence": "high"}
    ]
}

# Example 2: Christmas Postal Tax CT ↔ RA (prefix differs, base number matches)
FS_INPUT_2 = {
    "mena_issue": {
        "issue_data": {"issue_id": "CR-1968-CHRISTMAS-TAX", "issue_dates": {"placed_on_sale": "1968-12-01"}},
        "stamps": [
            {"catalog_no": "CT37", "status": "postal_tax",
             "denomination": {"value": 5, "unit": "c"}, "color": "gray"},
            {"catalog_no": "CT38", "status": "postal_tax",
             "denomination": {"value": 5, "unit": "c"}, "color": "rose red"}
        ]
    },
    "scott_candidates": [
        "Scott #RA37: 5c gray (year=1968)",
        "Scott #RA38: 5c rose red (year=1968)"
    ]
}
FS_OUTPUT_2 = {
    "issue_id": "CR-1968-CHRISTMAS-TAX",
    "equivalences": [
        {"mena": "CT37", "scott": "RA37", "confidence": "high"},
        {"mena": "CT38", "scott": "RA38", "confidence": "high"}
    ]
}

# Example 3: Souvenir Sheet case — no suitable Scott mapping ⇒ empty array
# (shows that candidates may be present yet none should be matched)
FS_INPUT_3 = {
    "mena_issue": {
        "issue_data": {"issue_id": "CR-1968-III-PHILATELIC-EXHIBITION-OVERPRINT",
                       "issue_dates": {"announced": "1968-08-01"}},
        "stamps": [
            {"catalog_no": "SSA497", "status": "souvenir_sheet",
             "denomination": {"value": None, "unit": "sheet"}, "color": "multicolor", "perforation": "13.5"},
            {"catalog_no": "SSA497a", "status": "souvenir_sheet",
             "denomination": {"value": None, "unit": "sheet"}, "color": "multicolor", "perforation": ""}
        ]
    },
    "scott_candidates": [
        "Scott #C475: 15c lt bl, blk & lt brn (year=1968)",
        "Scott #RA37: 5c gray (year=1968)"
    ]
}
FS_OUTPUT_3 = {
    "issue_id": "CR-1968-III-PHILATELIC-EXHIBITION-OVERPRINT",
    "equivalences": []
}


def _json(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False)


def _few_shot_block():
    """
    Build the few-shot section of the matcher prompt.

    Each (FS_INPUT_n, FS_OUTPUT_n) pair is serialized with _json and turned
    into one human message followed by one AI message.
    """
    turn_template = ChatPromptTemplate.from_messages([
        ("human", "{input}"),
        ("ai", "{output}")
    ])
    example_pairs = (
        (FS_INPUT_1, FS_OUTPUT_1),
        (FS_INPUT_2, FS_OUTPUT_2),
        (FS_INPUT_3, FS_OUTPUT_3),
    )
    shots = [
        {"input": _json(question), "output": _json(answer)}
        for question, answer in example_pairs
    ]
    return FewShotChatMessagePromptTemplate(
        example_prompt=turn_template,
        examples=shots,
    )


class MenaScottMatcher:
    """
    LLM-driven matcher that returns a stable, minimal schema:

    {
      "issue_id": "<Mena issue id>",
      "equivalences": [
        { "mena": "<Mena catalog_no>", "scott": "<Scott number>", "confidence": "low|medium|high" }
      ]
    }

    The model output is never trusted as-is: ``match`` always returns a dict of
    exactly this shape, falling back to an empty ``equivalences`` list when the
    call fails or the answer cannot be interpreted.
    """

    # Confidence labels accepted from the model; anything else becomes "low".
    _CONFIDENCE_LEVELS = ("low", "medium", "high")

    def __init__(
        self,
        openai_api_key: str,
        model_name: str = "gpt-5-mini",
        temperature: float = 1,
    ):
        """Create the chat model, the JSON output parser and the prompt chain.

        Args:
            openai_api_key: API key handed to ``ChatOpenAI``.
            model_name: Name of the chat model to call.
            temperature: Sampling temperature forwarded to the model.
        """
        self.llm = ChatOpenAI(
            model=model_name,
            temperature=temperature,
            api_key=openai_api_key,
            timeout=180.0,
            model_kwargs={
                "verbosity": "low",
                "reasoning_effort": "low",
            }
        )
        self.parser = JsonOutputParser()
        self.chain = self._create_chain()

    def _create_chain(self):
        """Assemble ``prompt | llm | parser``.

        The prompt is the system message (``MATCHER_SYSTEM_PROMPT``), followed
        by the few-shot examples, followed by one human message that carries
        the ``{payload}`` variable.
        """
        system_prompt = MATCHER_SYSTEM_PROMPT
        few_shot = _few_shot_block()
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            few_shot,
            ("human", "{payload}")  # single unified payload per call
        ])
        return prompt | self.llm | self.parser

    @staticmethod
    def _clean_text(value: Any) -> str:
        """Return *value* as stripped text; ``None`` maps to an empty string.

        Without the ``None`` check a JSON ``null`` would become the literal
        string "None" and be accepted as a catalogue number.
        """
        return "" if value is None else str(value).strip()

    @staticmethod
    def _sanitize_result(result: Any, fallback_issue_id: str) -> dict:
        """Coerce the parsed model output into the documented schema.

        Args:
            result: Whatever the output parser produced (ideally a dict).
            fallback_issue_id: Issue id to use when the output has none.

        Returns:
            ``{"issue_id": ..., "equivalences": [...]}`` where every entry has
            non-empty ``mena`` and ``scott`` strings and a ``confidence`` of
            "low", "medium" or "high".
        """
        # Valid JSON is not necessarily an object (it may be a list, a string,
        # or null); treat anything else as "no matches" instead of crashing.
        if not isinstance(result, dict):
            return {"issue_id": fallback_issue_id, "equivalences": []}

        issue_id = result.get("issue_id") or fallback_issue_id
        raw_pairs = result.get("equivalences")
        if not isinstance(raw_pairs, list):
            raw_pairs = []

        cleaned = []
        for item in raw_pairs:
            if not isinstance(item, dict):
                continue
            mena = MenaScottMatcher._clean_text(item.get("mena"))
            scott = MenaScottMatcher._clean_text(item.get("scott"))
            conf = MenaScottMatcher._clean_text(item.get("confidence", "low")).lower()
            if conf not in MenaScottMatcher._CONFIDENCE_LEVELS:
                conf = "low"
            # Drop pairs where either side is missing.
            if mena and scott:
                cleaned.append({"mena": mena, "scott": scott, "confidence": conf})
        return {"issue_id": issue_id, "equivalences": cleaned}

    def match(self, mena_issue: dict, scott_candidates: list) -> dict:
        """Ask the LLM to pair the stamps of one Mena issue with Scott numbers.

        Args:
            mena_issue: Mena issue dict; ``issue_data.issue_id`` is used as the
                fallback issue id when present.
            scott_candidates: Candidate Scott entries sent along with the issue.

        Returns:
            A dict in the schema described on the class. Any exception raised
            while calling the chain is printed (with traceback) and results in
            an empty ``equivalences`` list rather than being propagated.
        """
        payload = {"mena_issue": mena_issue, "scott_candidates": scott_candidates}
        fallback_issue_id = (mena_issue.get("issue_data") or {}).get("issue_id", "") or ""

        try:
            with get_openai_callback() as cb:
                result = self.chain.invoke({
                    "payload": json.dumps(payload, ensure_ascii=False)
                })
                print(
                    f"[Callback] prompt_tokens={cb.prompt_tokens} "
                    f"completion_tokens={cb.completion_tokens} "
                    f"total_tokens={cb.total_tokens} "
                    f"total_cost={cb.total_cost}"
                )
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and return an
            # empty result so a batch run can continue.
            print("LLM/Parsing error:", repr(e))
            traceback.print_exc()
            return {"issue_id": fallback_issue_id, "equivalences": []}

        # Sanitize the output (just in case).
        return self._sanitize_result(result, fallback_issue_id)

### Get Mena Issue

In [90]:
# Read the parsed Mena catalogue (a JSON file) from disk.
PATH = Path("results/parsed_catalogues/mena_parse_results_ALL.json")

# Decode the whole file as UTF-8 and parse it in one step.
mena_parsed_catalog = json.loads(PATH.read_text(encoding="utf-8"))

In [123]:
# Pick one issue from the parsed Mena catalogue by position (index 25 is
# hard-coded) and show its title, its stamp count and the full record.
mena_issue = mena_parsed_catalog[25]
print(f"Loaded Mena issue: {mena_issue['issue_data']['title']}")
print(f"Number of stamps: {len(mena_issue['stamps'])}")
print(mena_issue)

Loaded Mena issue: Overprint "Compre Ud. Cafe de Costa Rica" in circle
Number of stamps: 1
{'issue_data': {'issue_id': 'CR-1923-COFFEE-OVERPRINT', 'section': 'Surface Mail', 'title': 'Overprint "Compre Ud. Cafe de Costa Rica" in circle', 'country': 'Costa Rica', 'issue_dates': {'announced': None, 'placed_on_sale': None, 'probable_first_circulation': '1923-01-13', 'second_plate_sale': None, 'demonetized': None}, 'legal_basis': [{'type': 'decree', 'id': 'Decree #6', 'date': '1923-01-12', 'ids': [], 'officials': []}], 'currency_context': {'original': '', 'decimal_adoption': None, 'revaluation_date': None, 'revaluation_map': {}}, 'printing': {'printer': 'Imprenta Nacional', 'process': ['typography'], 'format': {'panes': 100}, 'plates': {'113': {'plates': [1, 2, 3], 'notes': 'printed with 3 plates: a = no varieties; b = varieties as listed; c = varieties plus pos 79 and 80 below the line'}}}, 'perforation': '12'}, 'production_orders': {'printings': [], 'remainders': {'date': None, 'note': '

### Get Scott Candidates

In [92]:
# Read the parsed Scott catalogue (grouped structure, JSON) from disk.
PATH = Path("results/parsed_catalogues/scott_parse_results_ALL.json")

# Decode the whole file as UTF-8 and parse it in one step.
scott_grouped = json.loads(PATH.read_text(encoding="utf-8"))

print(f"Loaded Scott catalog: {len(scott_grouped)} issue groups")

Loaded Scott catalog: 1086 issue groups


In [93]:
# CRITICAL STEP: turn the grouped Scott data into one flat, enriched list.
all_scott_stamps = flatten_and_enrich_scott_data(scott_grouped)

print(f"Preprocessed: {len(all_scott_stamps)} total stamps")
print(f"\nExample enriched variety stamp (Scott #1a):")

# Sanity check: look for the 1863 "1a" entry among the first ten stamps only
# and show the fields the matcher prompt relies on.
example_variety = next(
    (
        entry
        for entry in all_scott_stamps[:10]
        if entry.get('scott_number') == '1a' and entry.get('year') == 1863
    ),
    None,
)
if example_variety is not None:
    print(f"  denomination: {example_variety.get('denomination')}")
    print(f"  color: {example_variety.get('color')}")
    print(f"  variety_of: {example_variety.get('variety_of')}")

Preprocessed: 2559 total stamps

Example enriched variety stamp (Scott #1a):
  denomination: ½r
  color: light blue
  variety_of: 1


In [124]:
# Collect the Scott stamps that are plausible matches for this Mena issue.
# NOTE(review): the third argument (2) is presumably the +/- year window;
# confirm against build_candidate_pool.
scott_raw_candidates = build_candidate_pool(mena_issue, all_scott_stamps, 2)

# Render every candidate as one compact text line for the LLM payload.
scott_str_candidates = [
    f"  Scott #{cand.get('scott_number')}: {cand.get('denomination')} {cand.get('color')} (year={extract_scott_year(cand)})"
    for cand in scott_raw_candidates
]

Found 84 Scott candidates for year 1923 (±2 years)
Excluded 366 stamps without year information
Year distribution: {1921: 20, 1922: 9, 1923: 33, 1924: 17, 1925: 5}

All candidates overall:
  Scott #103: 5c bl & blk (year=1921)
  Scott #103a: 5c bl & blk (year=1921)
  Scott #103b: 5c bl & blk (year=1921)
  Scott #103c: 5c bl & blk (year=1921)
  Scott #104: 5c violet (year=1921)
  Scott #104a: 5c violet (year=1921)
  Scott #105: 2c orange & blk (year=1921)
  Scott #106: 3c green & blk (year=1921)
  Scott #107: 6c scarlet & blk (year=1921)
  Scott #108: 15c dk blue & blk (year=1921)
  Scott #109: 30c orange brn & blk (year=1921)
  Scott #110: 15c deep violet (year=1921)
  Scott #111: 5c violet (year=1922)
  Scott #111a: 5c violet (year=1922)
  Scott #111b: 5c violet (year=1922)
  Scott #111C: 1c brown (Bl) (year=1922)
  Scott #111D: 2c deep green (R) (year=1922)
  Scott #111E: 4c scarlet (year=1922)
  Scott #111F: 5c orange (year=1922)
  Scott #111G: 10c deep blue (R) (year=1922)
  Scott 

### Test

In [125]:
# --------------------------
# Example usage
# --------------------------

# Read the API key from the environment. When OPENAI_API_KEY is unset the
# placeholder "your-api-key" is used instead, so set the variable (or use a
# secret manager) before running this cell.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "your-api-key")

matcher = MenaScottMatcher(openai_api_key=OPENAI_API_KEY)

# Demo: match the Mena issue selected above against the Scott candidate
# strings built for it, then print the sanitized result dict.
mena_issue_demo = mena_issue
candidates_demo = scott_str_candidates
result = matcher.match(mena_issue_demo, candidates_demo)
print(result)

  exec(code_obj, self.user_global_ns, self.user_ns)


[Callback] prompt_tokens=3431 completion_tokens=301 total_tokens=3732 total_cost=0.0
{'issue_id': 'CR-1923-COFFEE-OVERPRINT', 'equivalences': [{'mena': '113', 'scott': '111J', 'confidence': 'high'}]}
