## Mena to Scott Catalog Matcher

Mena to Scott Catalog Matching Algorithm

Matches stamps from Mena catalog to Scott catalog using multi-signal scoring


In [None]:
from pathlib import Path
import json
from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
import re
from difflib import SequenceMatcher

In [None]:
# Location of the merged Scott parse results.
PATH = Path("results/parsed_catalogues/scott_parse_results_ALL.json")

# Read the whole JSON document into memory.
with PATH.open("r", encoding="utf-8") as f:
    items = json.load(f)

# Report how many top-level entries were loaded.
print(f"Total de items: {len(items)}")

# Print the Scott number of every stamp nested under each top-level entry;
# entries whose "stamps" value is missing or empty contribute nothing.
for item in items:
    for stamp in item.get("stamps") or []:
        print(stamp.get("scott_number"))

### Data Structures

In [None]:
@dataclass
class MatchResult:
    """Represents a match between Mena and Scott catalogs"""
    # Catalog number of the Mena stamp on one side of the match.
    mena_catalog_no: str
    # Scott number of the stamp it was paired with.
    scott_number: str
    # Confidence tier label; the matcher assigns "HIGH", "MEDIUM" or "LOW".
    confidence: str
    # Total match score (sum of signal points plus any later boost).
    score: float
    # Points awarded per signal, keyed by signal name.
    signals: Dict[str, float]
    # Human-readable explanation of how the score was obtained.
    breakdown: str
    # Reasons for any confidence boost applied after assignment.
    boost_reasons: List[str]
    # True when the match should be checked manually.
    requires_review: bool


@dataclass
class UnmatchedEntry:
    """Represents an unmatched catalog entry"""
    # Catalog number of the entry (Mena catalog number or Scott number).
    catalog_no: str
    # Denomination rendered as text.
    denomination: str
    # Color as given in the source catalog.
    color: str
    # Short explanation of why the entry has no counterpart.
    reason: str

### Normalization Dictionaries

In [None]:
# Lookup table used by normalize_color(): maps a lower-cased color string
# (abbreviation or spelled-out variant) to the standard color name used for
# comparison. Keys must be lower-case with single spaces between words.
COLOR_ABBREVIATIONS = {
    # Scott abbreviations → Standard
    "pl brn": "pale brown",
    "dk brn": "dark brown",
    "lt bl": "light blue",
    "dk bl": "dark blue",
    "org": "orange",
    "grn": "green",
    "dk grn": "dark green",
    "lt grn": "light green",
    "yel": "yellow",
    "blk": "black",
    "scar": "scarlet",
    "car": "carmine",
    "vio": "violet",
    "pur": "purple",
    "brn": "brown",
    "ol": "olive",
    "org red": "orange red",
    "red brn": "red brown",
    "gray": "grey",
    "grn bl": "green blue",
    "bl grn": "blue green",
    "red org": "red orange",
    "yel grn": "yellow green",
    "brn red": "brown red",
    
    # Common Mena terms → Standard
    # (most of these map to themselves; "reddish ..." variants are folded
    # into the plain "red ..." name)
    "pale brown": "pale brown",
    "light blue": "light blue",
    "dark brown": "dark brown",
    "dark blue": "dark blue",
    "light green": "light green",
    "dark green": "dark green",
    "orange red": "orange red",
    "red brown": "red brown",
    "reddish brown": "red brown",
    "reddish purple": "red purple",
    "blue green": "blue green",
    "green blue": "green blue",
}

# Groups of standard color names that are treated as "similar" when two
# colors are not identical. Some compound colors (e.g. "red brown",
# "yellow green", "red purple") are deliberately listed in more than one
# family. Dict order matters for find_color_family(), which returns the
# first family containing the color.
COLOR_FAMILIES = {
    "blue_family": [
        "blue", "light blue", "dark blue", "pale blue", "ultramarine", 
        "azure", "sky blue", "cobalt", "navy"
    ],
    "red_family": [
        "red", "scarlet", "carmine", "rose", "vermillion", "crimson", 
        "red orange", "red brown", "red purple"
    ],
    "yellow_family": [
        "yellow", "orange", "lemon", "gold", "amber", "yellow green"
    ],
    "green_family": [
        "green", "light green", "dark green", "olive", "emerald", 
        "blue green", "green blue", "yellow green"
    ],
    "brown_family": [
        "brown", "pale brown", "dark brown", "sepia", "chocolate", 
        "red brown", "brown red"
    ],
    "purple_family": [
        "purple", "violet", "mauve", "red purple", "reddish purple"
    ],
    "black_family": ["black", "grey", "gray"],
    "white_family": ["white"]
}

# Maps a denomination unit abbreviation to its canonical unit name.
# Every currency unit maps to the *singular* form so that units parsed from
# Scott strings (e.g. "5cts") compare equal to Mena units, which are
# singularized by normalize_denomination().
DENOMINATION_ABBREVIATIONS = {
    "r": "real",
    "½r": "half real",
    "c": "centavo",
    "ct": "centavo",
    "cts": "centavo",
    "p": "peso",
    "ps": "peso",
}

### Normalization functions

In [None]:
def normalize_color(color_string: str) -> str:
    """
    Normalize color strings to standard format
    
    The input is lower-cased and its whitespace is collapsed to single
    spaces *before* the abbreviation lookup, so that variants such as
    "Pl  Brn" or " pl brn " resolve the same way as "pl brn".
    
    Args:
        color_string: Raw color string (e.g., "pl brn", "pale brown")
    
    Returns:
        Normalized color string; "" for empty/None input. Strings that are
        not in COLOR_ABBREVIATIONS are returned lower-cased with collapsed
        whitespace.
    """
    if not color_string:
        return ""
    
    # split()/join trims the ends and squeezes internal runs of whitespace
    color_clean = " ".join(color_string.lower().split())
    
    return COLOR_ABBREVIATIONS.get(color_clean, color_clean)


def find_color_family(color: str) -> Optional[str]:
    """
    Look up the color family that contains a color
    
    The color is passed through normalize_color() first. Families are
    scanned in the order they are declared in COLOR_FAMILIES and the first
    one listing the color wins.
    
    Args:
        color: Color string (raw or already normalized)
    
    Returns:
        Name of the first matching family, or None when no family lists it
    """
    canonical = normalize_color(color)
    
    return next(
        (name for name, members in COLOR_FAMILIES.items() if canonical in members),
        None,
    )


def calculate_color_family_similarity(color1: str, color2: str) -> float:
    """
    Calculate similarity between two colors based on color families
    
    Both colors are normalized first. Identical colors score 1.0; colors
    that share at least one family score 0.85; anything else falls back to
    a character-level similarity ratio.
    
    A color may be listed in several families (e.g. "red brown" is in both
    the red and the brown family), so *all* families of each color are
    compared rather than only the first one found.
    
    Args:
        color1: First color (raw or normalized)
        color2: Second color (raw or normalized)
    
    Returns:
        Similarity score (0.0 to 1.0)
    """
    norm1 = normalize_color(color1)
    norm2 = normalize_color(color2)
    
    # Exact match
    if norm1 == norm2:
        return 1.0
    
    # Collect every family each color belongs to
    families1 = {name for name, members in COLOR_FAMILIES.items() if norm1 in members}
    families2 = {name for name, members in COLOR_FAMILIES.items() if norm2 in members}
    
    if families1 & families2:
        # At least one shared family - high similarity
        return 0.85
    
    # Use string similarity as fallback
    return SequenceMatcher(None, norm1, norm2).ratio()


def parse_denomination_string(denom_string: str) -> Dict[str, Any]:
    """
    Parse denomination string to extract value and unit
    
    The unit is mapped through DENOMINATION_ABBREVIATIONS and reduced to its
    singular form ("cts" -> "centavo", "reales" -> "real") so that it is
    comparable with the output of normalize_denomination().
    
    Args:
        denom_string: String like "5c", "½r", "1½r", "2 reales", "1p"
    
    Returns:
        Dictionary with 'value' (float) and 'unit' (str, possibly ""), or
        both None when the input is empty or contains no number
    """
    if not denom_string:
        return {"value": None, "unit": None}
    
    # str() tolerates numeric denominations coming straight from JSON
    text = str(denom_string).lower().strip()
    
    if "½" in text:
        # "½r" -> 0.5, "1½r" -> 1.5: add any whole part written before the ½
        whole = re.search(r'(\d+)\s*½', text)
        value = (float(whole.group(1)) if whole else 0.0) + 0.5
        unit = re.sub(r'[½\d\s.]', '', text)
    else:
        # Extract numeric value
        match = re.search(r'(\d+\.?\d*)', text)
        if not match:
            return {"value": None, "unit": None}
        value = float(match.group(1))
        
        # Whatever remains after removing digits, spaces and dots is the unit
        unit = re.sub(r'[\d\s.]', '', text)
    
    # Expand abbreviations, then singularize so plural and singular agree
    unit = DENOMINATION_ABBREVIATIONS.get(unit, unit)
    if unit == "reales":
        # Spanish plural adds "es"; plain "s"-stripping would give "reale"
        unit = "real"
    elif len(unit) > 1 and unit.endswith("s"):
        unit = unit[:-1]
        unit = DENOMINATION_ABBREVIATIONS.get(unit, unit)
    
    return {"value": value, "unit": unit}


def normalize_denomination(value: float, unit: str) -> Dict[str, Any]:
    """
    Normalize denomination to standard format
    
    The unit is lower-cased, expanded via DENOMINATION_ABBREVIATIONS and
    reduced to its singular form. A missing unit (None) is treated as an
    empty unit instead of raising.
    
    Args:
        value: Numeric value (returned unchanged)
        unit: Unit string (real, peso, centavo, etc.), may be None
    
    Returns:
        Dictionary with the original value and the normalized unit
    """
    unit_normalized = (unit or "").lower().strip()
    
    # Map abbreviations ("c" -> "centavo", "p" -> "peso", ...)
    unit_normalized = DENOMINATION_ABBREVIATIONS.get(unit_normalized, unit_normalized)
    
    # Handle plurals
    if unit_normalized == "reales":
        # Spanish plural adds "es"; plain "s"-stripping would give "reale"
        unit_normalized = "real"
    elif unit_normalized.endswith('s'):
        unit_normalized = unit_normalized[:-1]
        # The singular may itself be an abbreviation ("cts" -> "ct")
        unit_normalized = DENOMINATION_ABBREVIATIONS.get(unit_normalized, unit_normalized)
    
    return {
        "value": value,
        "unit": unit_normalized
    }


def normalize_perforation(perf_string: str) -> str:
    """
    Reduce a perforation description to its numeric parts
    
    Args:
        perf_string: Perforation string like "12", "Perf. 12", "Perf 12½"
    
    Returns:
        The runs of digits/½ found in the string, joined by single spaces
        ("" when the input is empty or contains none)
    """
    if not perf_string:
        return ""
    
    # Keep only the digit/½ runs, in their original order
    return " ".join(found.group(0) for found in re.finditer(r'[\d½]+', perf_string))

### Year Extraction

In [None]:
def extract_primary_year(issue_dates: Dict[str, Any]) -> Optional[int]:
    """
    Pick the primary year out of a Mena issue's dates
    
    The date fields are tried in a fixed priority order; the first one that
    holds a string containing four consecutive digits supplies the year.
    Non-string or empty values are skipped.
    
    Args:
        issue_dates: Dictionary of issue dates
    
    Returns:
        Primary year as integer, or None when no field yields one
    """
    # Highest-priority field first
    for field in (
        'placed_on_sale',
        'probable_first_circulation',
        'announced',
        'second_plate_sale',
    ):
        raw = issue_dates.get(field)
        if not raw or not isinstance(raw, str):
            continue
        
        # First 4-digit run is taken as the year (e.g. ISO "1863-04-11")
        found = re.search(r'(\d{4})', raw)
        if found:
            return int(found.group(1))
    
    return None


def extract_scott_year(scott_stamp: Dict[str, Any]) -> Optional[int]:
    """
    Extract year from Scott stamp entry
    
    The 'year' field is preferred. If it is not directly convertible to an
    integer (e.g. "1863-64"), the first four-digit run inside it is used.
    When 'year' is missing or yields nothing, the 'header' field is searched
    for a four-digit run instead.
    
    Args:
        scott_stamp: Scott stamp dictionary
    
    Returns:
        Year as integer, or None when neither field contains a year
    """
    # Try 'year' field first
    year = scott_stamp.get('year')
    if year:
        try:
            return int(year)
        except (TypeError, ValueError, OverflowError):
            # Ranges or annotated years such as "1863-64" / "1863?"
            match = re.search(r'\d{4}', str(year))
            if match:
                return int(match.group(0))
    
    # Try 'header' field, e.g. "1863" or "1863, Apr. 11"
    header = scott_stamp.get('header')
    if header:
        match = re.search(r'(\d{4})', str(header))
        if match:
            return int(match.group(1))
    
    return None

### Candidate Pool Building

In [None]:
def build_candidate_pool(
    mena_issue: Dict[str, Any],
    all_scott_stamps: List[Dict[str, Any]],
    year_tolerance: int = 2
) -> List[Dict[str, Any]]:
    """
    Select the Scott stamps worth scoring against a Mena issue
    
    A Scott stamp is kept when its year lies within ``year_tolerance`` of
    the issue's primary year, or when it has no year at all. If the Mena
    issue itself has no usable year, the full Scott list is returned as-is
    (the same list object) after printing a warning.
    
    Args:
        mena_issue: Complete Mena issue data
        all_scott_stamps: List of all Scott stamps
        year_tolerance: Years +/- to include (default: 2)
    
    Returns:
        List of candidate Scott stamps
    """
    target_year = extract_primary_year(mena_issue['issue_data']['issue_dates'])
    
    if not target_year:
        print("Warning: Could not extract year from Mena issue")
        return all_scott_stamps  # No year to filter on
    
    def _within_window(stamp: Dict[str, Any]) -> bool:
        # Undated stamps are always kept so they can still be matched
        stamp_year = extract_scott_year(stamp)
        return stamp_year is None or abs(stamp_year - target_year) <= year_tolerance
    
    candidates = [stamp for stamp in all_scott_stamps if _within_window(stamp)]
    
    print(f"Found {len(candidates)} Scott candidates for year {target_year} (±{year_tolerance} years)")
    
    return candidates

### Multi Signal Scoring

In [None]:
def _score_denomination(
    mena_stamp: Dict[str, Any],
    scott_stamp: Dict[str, Any]
) -> Tuple[float, Optional[str]]:
    """Score the denomination signal (max 35); returns (points, breakdown text)."""
    scott_denom_str = scott_stamp.get('denomination', '')
    scott_denom = parse_denomination_string(scott_denom_str)
    
    # A Mena stamp without a usable denomination cannot match on this signal;
    # score it 0 instead of failing on the missing dict / None unit.
    mena_raw = mena_stamp.get('denomination')
    if not isinstance(mena_raw, dict):
        return 0, "Denom: no match"
    mena_value = mena_raw.get('value')
    mena_unit = mena_raw.get('unit')
    if mena_value is None or not isinstance(mena_unit, str):
        return 0, "Denom: no match"
    
    mena_denom = normalize_denomination(mena_value, mena_unit)
    same_unit = mena_denom['unit'] == scott_denom['unit']
    
    if same_unit and mena_denom['value'] == scott_denom['value']:
        return 35, f"Denom: {mena_denom['value']}{mena_denom['unit'][:1]} = {scott_denom_str} ✓"
    
    if same_unit and mena_denom['value']:
        # Same unit, different value: partial credit when within 20%
        try:
            diff_percentage = abs(mena_denom['value'] - scott_denom['value']) / mena_denom['value']
        except TypeError:
            # Non-numeric value on either side
            return 0, "Denom: no match"
        if diff_percentage < 0.2:
            return 20, f"Denom: similar (~{int((1-diff_percentage)*100)}%)"
        return 0, "Denom: mismatch"
    
    return 0, "Denom: no match"


def _score_color(
    mena_stamp: Dict[str, Any],
    scott_stamp: Dict[str, Any]
) -> Tuple[float, Optional[str]]:
    """Score the color signal (max 30); returns (points, breakdown text)."""
    mena_color = mena_stamp.get('color', '')
    scott_color = scott_stamp.get('color', '')
    
    if not (mena_color and scott_color):
        return 0, "Color: missing data"
    
    color_similarity = calculate_color_family_similarity(mena_color, scott_color)
    
    if color_similarity >= 0.95:
        text = f"Color: {mena_color} = {scott_color} ✓"
    elif color_similarity >= 0.80:
        text = f"Color: {mena_color} ≈ {scott_color} ({int(color_similarity*100)}%)"
    else:
        text = f"Color: {mena_color} vs {scott_color} ({int(color_similarity*100)}%)"
    
    return color_similarity * 30, text


def _score_year(
    mena_issue_context: Dict[str, Any],
    scott_stamp: Dict[str, Any]
) -> Tuple[float, Optional[str]]:
    """Score the year-proximity signal (max 20); returns (points, breakdown text)."""
    mena_year = extract_primary_year(mena_issue_context['issue_dates'])
    scott_year = extract_scott_year(scott_stamp)
    
    if not (mena_year and scott_year):
        return 0, "Year: missing data"
    
    year_diff = abs(mena_year - scott_year)
    if year_diff == 0:
        return 20, f"Year: {mena_year} ✓"
    
    # 15 points for one year apart, 10 for two, nothing beyond that
    points = {1: 15, 2: 10}.get(year_diff, 0)
    return points, f"Year: {mena_year} vs {scott_year} (±{year_diff})"


def _score_perforation(
    mena_stamp: Dict[str, Any],
    scott_stamp: Dict[str, Any]
) -> Tuple[float, Optional[str]]:
    """Score the perforation signal (max 10); text is None when data is missing."""
    mena_perf = mena_stamp.get('perforation', '')
    scott_perf = scott_stamp.get('perforation', '')
    
    if not (mena_perf and scott_perf):
        return 0, None
    
    if normalize_perforation(str(mena_perf)) == normalize_perforation(str(scott_perf)):
        return 10, f"Perf: {mena_perf} ✓"
    return 0, f"Perf: {mena_perf} vs {scott_perf}"


def _score_illustration(scott_stamp: Dict[str, Any]) -> Tuple[float, Optional[str]]:
    """Score the illustration signal (max 5); text is None when absent."""
    # NOTE(review): points are granted whenever the Scott entry carries an
    # illustration reference; nothing is compared against the Mena stamp.
    scott_illustration = scott_stamp.get('illustration', '')
    if scott_illustration:
        return 5, f"Illus: {scott_illustration}"
    return 0, None


def calculate_match_score(
    mena_stamp: Dict[str, Any],
    scott_stamp: Dict[str, Any],
    mena_issue_context: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Calculate match score using multiple signals
    
    Five signals are scored independently and summed (maximum 100):
    denomination (35), color (30), year proximity (20), perforation (10)
    and illustration (5). A Mena stamp whose denomination is missing or
    incomplete scores 0 on the denomination signal rather than raising.
    
    Args:
        mena_stamp: Single Mena stamp entry
        scott_stamp: Single Scott stamp entry
        mena_issue_context: Issue-level context from Mena (must contain
            'issue_dates')
    
    Returns:
        Dictionary with 'total_score' (float), 'signals' (points per signal
        name) and 'breakdown' (" | "-joined explanation of each signal)
    """
    # Evaluated in this order so the breakdown text keeps a stable layout
    scored = (
        ('denomination', _score_denomination(mena_stamp, scott_stamp)),
        ('color', _score_color(mena_stamp, scott_stamp)),
        ('year', _score_year(mena_issue_context, scott_stamp)),
        ('perforation', _score_perforation(mena_stamp, scott_stamp)),
        ('illustration', _score_illustration(scott_stamp)),
    )
    
    signals = {}
    total_score = 0.0
    breakdown_parts = []
    
    for name, (points, note) in scored:
        signals[name] = points
        total_score += points
        if note is not None:
            breakdown_parts.append(note)
    
    return {
        'total_score': total_score,
        'signals': signals,
        'breakdown': " | ".join(breakdown_parts)
    }


### Scoring Matrix

In [None]:
def score_all_candidates(
    mena_issue: Dict[str, Any],
    scott_candidate_pool: List[Dict[str, Any]],
    min_threshold: float = 30.0
) -> List[Dict[str, Any]]:
    """
    Build the scoring matrix for one Mena issue
    
    Every Mena stamp is scored against every Scott candidate; pairs scoring
    below ``min_threshold`` are discarded and the rest are ranked best-first.
    
    Args:
        mena_issue: Complete Mena issue
        scott_candidate_pool: List of Scott candidate stamps
        min_threshold: Minimum score to keep (default: 30)
    
    Returns:
        List of scoring rows (one per Mena stamp), each holding the stamp,
        its catalog number and its surviving candidates
    """
    rows = []
    
    for mena_stamp in mena_issue['stamps']:
        kept = []
        row = {
            'mena_catalog_no': mena_stamp['catalog_no'],
            'mena_stamp': mena_stamp,
            'candidates': kept
        }
        
        for scott_candidate in scott_candidate_pool:
            outcome = calculate_match_score(
                mena_stamp,
                scott_candidate,
                mena_issue['issue_data']
            )
            
            # Drop pairs that did not reach the threshold
            if outcome['total_score'] >= min_threshold:
                kept.append({
                    'scott_number': scott_candidate.get('scott_number', 'UNKNOWN'),
                    'scott_stamp': scott_candidate,
                    'score': outcome['total_score'],
                    'signals': outcome['signals'],
                    'breakdown': outcome['breakdown']
                })
        
        # Best candidate first
        kept.sort(key=lambda entry: entry['score'], reverse=True)
        rows.append(row)
    
    return rows

## Optimal Assignment

In [None]:
def find_optimal_assignment(scoring_matrix: List[Dict[str, Any]]) -> List[MatchResult]:
    """
    Derive a one-to-one assignment between Mena and Scott stamps
    
    The strategy is greedy: all candidate pairs are ranked by score and
    accepted from the top down, skipping any pair whose Mena catalog number
    or Scott number has already been used. Confidence is "HIGH" from 70
    points, "MEDIUM" from 50, otherwise "LOW"; only "HIGH" matches are
    exempt from manual review.
    
    Args:
        scoring_matrix: Matrix of all possible matches with scores
    
    Returns:
        List of MatchResult objects, in the order they were accepted
    """
    # One flat record per (Mena stamp, Scott candidate) pair
    pairs = [
        {
            'mena_catalog_no': row['mena_catalog_no'],
            'scott_number': cand['scott_number'],
            'score': cand['score'],
            'signals': cand['signals'],
            'breakdown': cand['breakdown']
        }
        for row in scoring_matrix
        for cand in row['candidates']
    ]
    
    # Highest score first (stable, so ties keep their original order)
    ranked = sorted(pairs, key=lambda pair: pair['score'], reverse=True)
    
    results = []
    taken_mena = set()
    taken_scott = set()
    
    for pair in ranked:
        mena_no = pair['mena_catalog_no']
        scott_no = pair['scott_number']
        
        # Each stamp on either side may be used only once
        if mena_no in taken_mena or scott_no in taken_scott:
            continue
        
        score = pair['score']
        if score >= 70:
            confidence = "HIGH"
        elif score >= 50:
            confidence = "MEDIUM"
        else:
            confidence = "LOW"
        
        results.append(MatchResult(
            mena_catalog_no=mena_no,
            scott_number=scott_no,
            confidence=confidence,
            score=score,
            signals=pair['signals'],
            breakdown=pair['breakdown'],
            boost_reasons=[],
            requires_review=(confidence != "HIGH")
        ))
        taken_scott.add(scott_no)
        taken_mena.add(mena_no)
    
    return results

### Validation and Confidence Boosting

In [None]:
def check_sequential_pattern(assignments: List["MatchResult"]) -> bool:
    """
    Check if assignments follow a sequential numbering pattern
    
    Assignments are ordered by Mena catalog number using a natural sort
    (so "2" comes before "10"), then the leading numeric part of each Scott
    number is checked for being mostly increasing.
    
    Args:
        assignments: List of match results
    
    Returns:
        True if at least 70% of consecutive Scott numbers increase; False
        when fewer than three assignments or numeric Scott numbers exist
    """
    if len(assignments) < 3:
        return False
    
    def _natural_key(assignment):
        # re.split with a capture group alternates text / digit chunks;
        # odd positions are the digit runs. Tagged tuples keep ints and
        # strings from ever being compared with each other.
        chunks = re.split(r'(\d+)', str(assignment.mena_catalog_no))
        return [
            (0, int(chunk), '') if index % 2 else (1, 0, chunk)
            for index, chunk in enumerate(chunks)
        ]
    
    # Sort by Mena catalog number (numeric-aware)
    sorted_assignments = sorted(assignments, key=_natural_key)
    
    # Extract the first numeric part of each Scott number, skipping entries
    # that have none
    scott_numbers = []
    for assignment in sorted_assignments:
        match = re.search(r'(\d+)', str(assignment.scott_number))
        if match:
            scott_numbers.append(int(match.group(1)))
    
    if len(scott_numbers) < 3:
        return False
    
    # Check if mostly increasing
    increasing_count = sum(
        1 for current, following in zip(scott_numbers, scott_numbers[1:])
        if current < following
    )
    
    return increasing_count >= (len(scott_numbers) - 1) * 0.7


def check_denomination_progression(assignments: List[MatchResult]) -> bool:
    """
    Report whether the assignments are taken to show a denomination progression
    
    Placeholder heuristic: no denomination values are inspected; the result
    depends only on how many assignments there are.
    
    Args:
        assignments: List of match results
    
    Returns:
        True when there are at least three assignments, otherwise False
    """
    # Simplified check; could be enhanced to compare actual denomination values
    return len(assignments) >= 3


def validate_and_boost_confidence(
    assignments: List[MatchResult],
    mena_issue: Dict[str, Any]
) -> List[MatchResult]:
    """
    Apply set-level bonuses and re-derive each match's confidence
    
    Two patterns are evaluated once over the whole set of assignments: a
    sequential numbering pattern (+10) and a denomination progression (+5).
    The resulting bonus is added to every assignment's score in place, and
    confidence is then recomputed as "HIGH" from 80 points, "MEDIUM" from
    60, otherwise "LOW". The recomputation happens even when no bonus
    applies, and a boosted score is not capped.
    
    Args:
        assignments: List of match results (mutated in place)
        mena_issue: Original Mena issue data (currently not consulted)
    
    Returns:
        The same list of match results, updated
    """
    has_sequential = check_sequential_pattern(assignments)
    has_denom_progression = check_denomination_progression(assignments)
    
    # (points, reason) for every pattern that was detected
    active_bonuses = []
    if has_sequential:
        active_bonuses.append((10, "Sequential number pattern"))
    if has_denom_progression:
        active_bonuses.append((5, "Denomination progression"))
    
    bonus_points = sum(points for points, _ in active_bonuses)
    
    for assignment in assignments:
        assignment.score += bonus_points
        # Fresh list per assignment so results never share one object
        assignment.boost_reasons = [reason for _, reason in active_bonuses]
        
        if assignment.score >= 80:
            tier = "HIGH"
        elif assignment.score >= 60:
            tier = "MEDIUM"
        else:
            tier = "LOW"
        
        assignment.confidence = tier
        assignment.requires_review = tier != "HIGH"
    
    return assignments

### Unmatched Identification

In [None]:
def identify_unmatched(
    assignments: List[MatchResult],
    mena_issue: Dict[str, Any],
    scott_candidate_pool: List[Dict[str, Any]]
) -> Dict[str, List[UnmatchedEntry]]:
    """
    List the stamps on either side that ended up without a partner
    
    Args:
        assignments: List of successful matches
        mena_issue: Mena issue data
        scott_candidate_pool: Scott candidates
    
    Returns:
        Dictionary with unmatched_mena and unmatched_scott lists
    """
    # Mena stamps whose catalog number appears in no assignment
    paired_mena = {match.mena_catalog_no for match in assignments}
    unmatched_mena = [
        UnmatchedEntry(
            catalog_no=stamp['catalog_no'],
            denomination=f"{stamp['denomination']['value']}{stamp['denomination']['unit']}",
            color=stamp.get('color', 'N/A'),
            reason="No suitable Scott match found"
        )
        for stamp in mena_issue['stamps']
        if stamp['catalog_no'] not in paired_mena
    ]
    
    # Scott candidates whose number appears in no assignment; candidates
    # without a number are reported as 'UNKNOWN'
    paired_scott = {match.scott_number for match in assignments}
    numbered_candidates = (
        (stamp.get('scott_number', 'UNKNOWN'), stamp)
        for stamp in scott_candidate_pool
    )
    unmatched_scott = [
        UnmatchedEntry(
            catalog_no=scott_no,
            denomination=stamp.get('denomination', 'N/A'),
            color=stamp.get('color', 'N/A'),
            reason="No Mena equivalent found"
        )
        for scott_no, stamp in numbered_candidates
        if scott_no not in paired_scott
    ]
    
    return {
        'unmatched_mena': unmatched_mena,
        'unmatched_scott': unmatched_scott
    }


### Main Function

In [None]:
def match_mena_to_scott(
    mena_issue: Dict[str, Any],
    all_scott_stamps: List[Dict[str, Any]],
    year_tolerance: int = 2,
    min_score_threshold: float = 30.0
) -> Dict[str, Any]:
    """
    Run the full Mena-to-Scott matching pipeline for one issue
    
    Phases: build the candidate pool, score every pair, assign pairs,
    boost/validate confidence, then list unmatched entries. Progress and a
    summary are printed along the way.
    
    Args:
        mena_issue: Complete Mena issue data structure
        all_scott_stamps: List of all Scott stamp entries
        year_tolerance: Years +/- to include in candidate pool
        min_score_threshold: Minimum score to consider a match
    
    Returns:
        Dictionary with 'issue_match', 'assignments', 'unmatched',
        'statistics' and the raw 'scoring_matrix'
    """
    rule = "="*80
    
    def _entry_as_dict(entry: UnmatchedEntry) -> Dict[str, Any]:
        # Plain-dict form of an unmatched entry for the result payload
        return {
            'catalog_no': entry.catalog_no,
            'denomination': entry.denomination,
            'color': entry.color,
            'reason': entry.reason
        }
    
    print("\n" + rule)
    print("MENA TO SCOTT CATALOG MATCHING")
    print(rule)
    
    # Phase 1: candidate pool
    print("\n[Phase 1] Building candidate pool...")
    scott_candidates = build_candidate_pool(mena_issue, all_scott_stamps, year_tolerance)
    
    # Phase 2: scoring
    print(f"\n[Phase 2] Scoring {len(mena_issue['stamps'])} Mena stamps against {len(scott_candidates)} Scott candidates...")
    scoring_matrix = score_all_candidates(mena_issue, scott_candidates, min_score_threshold)
    
    for row in scoring_matrix:
        print(f"  Mena #{row['mena_catalog_no']}: {len(row['candidates'])} candidates found")
    
    # Phase 3: assignment
    print("\n[Phase 3] Finding optimal assignment...")
    assignments = find_optimal_assignment(scoring_matrix)
    print(f"  Created {len(assignments)} initial assignments")
    
    # Phase 4: validation / boosting
    print("\n[Phase 4] Validating and boosting confidence...")
    assignments = validate_and_boost_confidence(assignments, mena_issue)
    
    # Phase 5: leftovers
    print("\n[Phase 5] Identifying unmatched entries...")
    unmatched = identify_unmatched(assignments, mena_issue, scott_candidates)
    print(f"  Unmatched Mena: {len(unmatched['unmatched_mena'])}")
    print(f"  Unmatched Scott: {len(unmatched['unmatched_scott'])}")
    
    # Statistics
    total_mena = len(mena_issue['stamps'])
    tier_counts = {"HIGH": 0, "MEDIUM": 0, "LOW": 0}
    for match in assignments:
        if match.confidence in tier_counts:
            tier_counts[match.confidence] += 1
    
    if total_mena > 0:
        success_rate = len(assignments) / total_mena * 100
    else:
        success_rate = 0
    
    statistics = {
        'total_mena_stamps': total_mena,
        'total_assignments': len(assignments),
        'high_confidence': tier_counts["HIGH"],
        'medium_confidence': tier_counts["MEDIUM"],
        'low_confidence': tier_counts["LOW"],
        'unmatched_mena': len(unmatched['unmatched_mena']),
        'unmatched_scott': len(unmatched['unmatched_scott']),
        'success_rate': round(success_rate, 1)
    }
    
    # Result payload
    issue_data = mena_issue['issue_data']
    issue_match = {
        'mena_issue_id': issue_data['issue_id'],
        'mena_title': issue_data['title'],
        'mena_year': extract_primary_year(issue_data['issue_dates']),
        'candidate_pool_size': len(scott_candidates)
    }
    
    serialized_assignments = []
    for match in assignments:
        serialized_assignments.append({
            'mena_catalog_no': match.mena_catalog_no,
            'scott_number': match.scott_number,
            'confidence': match.confidence,
            'score': round(match.score, 1),
            'signals': {name: round(points, 1) for name, points in match.signals.items()},
            'breakdown': match.breakdown,
            'boost_reasons': match.boost_reasons,
            'requires_review': match.requires_review
        })
    
    result = {
        'issue_match': issue_match,
        'assignments': serialized_assignments,
        'unmatched': {
            'mena': [_entry_as_dict(entry) for entry in unmatched['unmatched_mena']],
            'scott': [_entry_as_dict(entry) for entry in unmatched['unmatched_scott']]
        },
        'statistics': statistics,
        'scoring_matrix': scoring_matrix  # Kept for detailed analysis
    }
    
    # Summary
    print("\n" + rule)
    print("MATCHING SUMMARY")
    print(rule)
    print(f"Total Mena stamps: {statistics['total_mena_stamps']}")
    print(f"Successful matches: {statistics['total_assignments']}")
    print(f"  - High confidence: {statistics['high_confidence']}")
    print(f"  - Medium confidence: {statistics['medium_confidence']}")
    print(f"  - Low confidence: {statistics['low_confidence']}")
    print(f"Success rate: {statistics['success_rate']}%")
    print(rule + "\n")
    
    return result


# ============================================================================
# PRETTY PRINTING
# ============================================================================

def print_matching_results(result: Dict[str, Any]) -> None:
    """
    Print a human-readable report of a matching run
    
    Sections, in order: issue header, assignments, unmatched Mena stamps,
    unmatched Scott stamps (each only when non-empty) and statistics.
    
    Args:
        result: Result dictionary from match_mena_to_scott
    """
    heavy_rule = "="*80
    light_rule = "-"*80
    
    print("\n" + heavy_rule)
    print("DETAILED MATCHING RESULTS")
    print(heavy_rule)
    
    issue = result['issue_match']
    print(f"\nIssue: {issue['mena_title']} ({issue['mena_year']})")
    print(f"Mena ID: {issue['mena_issue_id']}")
    print(f"Scott candidate pool: {issue['candidate_pool_size']} stamps")
    
    print("\n" + light_rule)
    print("ASSIGNMENTS")
    print(light_rule)
    
    for match in result['assignments']:
        print(f"\n✓ Mena #{match['mena_catalog_no']} → Scott #{match['scott_number']}")
        print(f"  Confidence: {match['confidence']} (Score: {match['score']}/100)")
        print(f"  Signals: {match['signals']}")
        print(f"  Breakdown: {match['breakdown']}")
        if match['boost_reasons']:
            print(f"  Boosts: {', '.join(match['boost_reasons'])}")
        if match['requires_review']:
            print(f"  ⚠️  REQUIRES MANUAL REVIEW")
    
    # Both unmatched sections share one layout; only the labels differ
    for side, heading, label in (
        ('mena', "UNMATCHED MENA STAMPS", "Mena"),
        ('scott', "UNMATCHED SCOTT STAMPS", "Scott"),
    ):
        entries = result['unmatched'][side]
        if not entries:
            continue
        print("\n" + light_rule)
        print(heading)
        print(light_rule)
        for entry in entries:
            print(f"  ✗ {label} #{entry['catalog_no']}: {entry['denomination']} {entry['color']}")
            print(f"    Reason: {entry['reason']}")
    
    print("\n" + heavy_rule)
    print("STATISTICS")
    print(heavy_rule)
    stats = result['statistics']
    print(f"Total Mena stamps: {stats['total_mena_stamps']}")
    print(f"Matched: {stats['total_assignments']} ({stats['success_rate']}%)")
    print(f"  High confidence: {stats['high_confidence']}")
    print(f"  Medium confidence: {stats['medium_confidence']}")
    print(f"  Low confidence: {stats['low_confidence']}")
    print(f"Unmatched Mena: {stats['unmatched_mena']}")
    print(f"Unmatched Scott: {stats['unmatched_scott']}")
    print(heavy_rule + "\n")


### Test

#### Get Mena Test Issue

In [None]:
# Path to the merged Mena parse results
PATH = Path("results/parsed_catalogues/mena_parse_results_ALL.json")

# Load the JSON document
with PATH.open("r", encoding="utf-8") as f:
    items = json.load(f)

# Report how many top-level entries were loaded
print(f"Total de items: {len(items)}")

# Use the first entry as the test issue (raises IndexError if the file is empty)
mena_issue = items[0]

In [None]:
# Bare expression: in a notebook cell this displays the selected Mena issue
mena_issue

#### Get Scott Stamps

In [None]:

# Path to the merged Scott parse results
PATH = Path("results/parsed_catalogues/scott_parse_results_ALL.json")

# Load the JSON document
with PATH.open("r", encoding="utf-8") as f:
    items = json.load(f)

# The matcher expects one dict per stamp (with 'scott_number', 'color', ...),
# but this file's top-level entries carry their stamps in a nested "stamps"
# list. Flatten them, letting each stamp inherit the fields of its parent
# entry so entry-level data stays available to the matcher.
# NOTE(review): assumes entry-level keys such as 'header'/'year' describe
# every stamp nested under that entry - confirm against the parser output.
all_scott_stamps = []
for item in items:
    if not isinstance(item, dict) or "stamps" not in item:
        # Already a flat stamp record
        all_scott_stamps.append(item)
        continue
    inherited = {key: val for key, val in item.items() if key != "stamps"}
    for stamp in item.get("stamps") or []:
        merged = dict(inherited)
        # Stamp-level values win, except None which must not mask a parent value
        merged.update({key: val for key, val in stamp.items() if val is not None})
        all_scott_stamps.append(merged)

# Report how many individual stamps are available for matching
print(f"Total de items: {len(all_scott_stamps)}")

#### Run the test

In [None]:
# Run matching for the selected Mena issue against the loaded Scott stamps,
# passing the year tolerance and score threshold explicitly
result = match_mena_to_scott(
    mena_issue=mena_issue,
    all_scott_stamps=all_scott_stamps,
    year_tolerance=2,
    min_score_threshold=30.0
)

# Print the detailed, human-readable report
print_matching_results(result)