# The Costa Rica (CR) Philatelic Knowledge Graph

In [1]:
import json
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

In [2]:
# Define Pydantic models for structured output
class StampNode(BaseModel):
    """Pydantic schema describing one catalogued stamp.

    ``id``, ``denomination``, ``color`` and ``description`` are required;
    every other field falls back to the default declared on it.
    """
    id: str = Field(description="Unique identifier for the stamp")
    # Catalog references default to an empty string when not supplied.
    scott_number: str = Field(default="", description="Scott catalog number")
    michel_number: str = Field(default="", description="Michel catalog number")
    denomination: str = Field(description="Denomination value with currency")
    color: str = Field(description="Primary color of the stamp")
    # default_factory gives each instance its own list.
    color_variants: List[str] = Field(default_factory=list, description="Color variations")
    description: str = Field(description="Description of the stamp design")
    # Stored as a plain string; the YYYY-MM-DD format is only stated in the description.
    issue_date: str = Field(default="", description="Issue date in YYYY-MM-DD format")
    printer: str = Field(default="", description="Printing company")
    printing_method: str = Field(default="", description="Printing technique used")
    perforation: str = Field(default="", description="Perforation measurement")
    # 0 is the default when no quantity is supplied.
    quantity_issued: int = Field(default=0, description="Number of stamps issued")
    status: str = Field(default="Issued", description="Status: Issued, Unissued, Withdrawn")

class VarietyNode(BaseModel):
    """Pydantic schema describing a variety of a stamp.

    All fields except ``rarity`` are required. ``parent_stamp_id`` carries the
    id of the stamp the variety belongs to.
    """
    id: str = Field(description="Unique identifier for the variety")
    type: str = Field(description="Type of variety (e.g., Color variety, Perforation)")
    description: str = Field(description="Description of the variety")
    # Empty string when no rarity is given.
    rarity: str = Field(default="", description="Rarity level")
    parent_stamp_id: str = Field(description="ID of the parent stamp")

class ProofNode(BaseModel):
    """Pydantic schema describing a proof of a stamp.

    ``paper_type`` and ``rarity`` are optional (empty string by default); the
    remaining fields are required. ``parent_stamp_id`` carries the id of the
    stamp the proof belongs to.
    """
    id: str = Field(description="Unique identifier for the proof")
    type: str = Field(description="Type of proof (Die proof, Plate proof)")
    color: str = Field(description="Color of the proof")
    paper_type: str = Field(default="", description="Type of paper used")
    description: str = Field(description="Description of the proof")
    rarity: str = Field(default="", description="Rarity level")
    parent_stamp_id: str = Field(description="ID of the parent stamp")

class SpecimenNode(BaseModel):
    """Pydantic schema describing a specimen (overprinted) stamp.

    Every field is required. ``parent_stamp_id`` carries the id of the stamp
    the specimen belongs to.
    """
    id: str = Field(description="Unique identifier for the specimen")
    overprint_text: str = Field(description="Text of the overprint")
    description: str = Field(description="Description of the specimen")
    parent_stamp_id: str = Field(description="ID of the parent stamp")

class PhilatelicExtractor:
    def __init__(self, api_key: str, model: str = "gpt-4", context_window: Tuple[int, int] = (-3, 1),
                 skip_elements: Optional[List[str]] = None):
        """Set up the chat model client and the extraction settings.

        Args:
            api_key: API key passed to ``ChatOpenAI``.
            model: Name of the chat model to use.
            context_window: ``(before, after)`` offsets around the element
                being analysed. ``before`` is expected to be negative, e.g.
                ``(-3, 1)`` means three elements before and one after.
            skip_elements: Element labels that the page loops ignore.
                ``None`` selects the default ``["footer", "foot"]``; an
                explicitly empty list means "skip nothing".
        """
        self.llm = ChatOpenAI(
            model=model,
            api_key=api_key,
            # Author's note (translated): "mandatory for gpt-5-nano".
            # NOTE(review): this requirement cannot be confirmed from this file.
            temperature=0.1,
            timeout=120.0,
            #max_completion_tokens=2500,
            model_kwargs={
                # "verbosity": "high",
                # "reasoning_effort" : "medium",
                "response_format": {"type": "json_object"}  # Force JSON output
            })
        self.context_window = context_window  # (before, after)

        # Headers can provide valuable context, so only footers are skipped by
        # default. Compare against None rather than relying on truthiness so
        # that an explicit empty list is honoured, and copy the list so later
        # changes made by the caller do not leak into this instance.
        if skip_elements is None:
            self.skip_elements = ["footer", "foot"]
        else:
            self.skip_elements = list(skip_elements)

        # Glossary injected verbatim into the system prompt.
        # NOTE(review): the currency dates below are sent to the model as facts;
        # verify them against the catalogue before relying on them.
        self.philatelic_context = """
        PHILATELIC ABBREVIATIONS AND TERMS:
        - S = Specimen (overprinted stamps)
        - DP = Die Proof (master die impressions)
        - PP = Plate Proof (printing plate impressions)
        - P = Peso (currency)
        - c = centavo/centimo (currency subdivision)
        - imperf = imperforate (no perforations)
        - perf = perforate (with perforations)
        - op = overprint
        - FDC = First Day Cover
        - CDS = Circular Date Stamp
        
        CATALOG NUMBERING:
        - Scott: Primary US catalog system (e.g., S21, S22)
        - Michel: German catalog system
        - Yvert: French catalog system
        - Suffix letters (a, b, c) indicate varieties or color shades
        
        RARITY TERMS:
        - Common: Easily available
        - Scarce: Limited availability
        - Rare: Difficult to find
        - Very Rare: Extremely limited
        - Unique: Only one known
        
        COSTA RICA SPECIFIC:
        - Real/Reales: Currency used 1863-1896
        - Peso: Currency from 1896
        - Waterlow & Sons: Major printer for Costa Rica stamps
        - American Bank Note Company: Early printer
        """
    
    def get_element_with_context(self, elements: List[Dict], current_idx: int) -> Dict[str, Any]:
        """
        Get the current element along with context window elements.
        
        Args:
            elements: List of page elements sorted by reading_order
            current_idx: Index of the current element to analyze
            
        Returns:
            Dict with current element and context elements before/after
            
        Context window format: (before_count, after_count)
        - before_count: negative number indicating how many elements before current
        - after_count: positive number indicating how many elements after current
        Example: (-3, 1) means 3 elements before and 1 element after
        """
        before_count, after_count = self.context_window
        
        # Get context elements
        context_before = []
        context_after = []
        
        # Get elements before current (before_count is negative, so we add it to get earlier indices)
        start_idx = max(0, current_idx + before_count)  # before_count is negative
        for i in range(start_idx, current_idx):
            elem = elements[i]
            context_before.append({
                "position": f"Context [{i - current_idx}]",  # Will be negative (e.g., -3, -2, -1)
                "label": elem.get("label", "unknown"),
                "text": elem.get("text", ""),
                "reading_order": elem.get("reading_order", -1)
            })
        
        # Current element
        current_element = elements[current_idx]
        
        # Get elements after current
        end_idx = min(len(elements), current_idx + after_count + 1)
        for i in range(current_idx + 1, end_idx):
            elem = elements[i]
            context_after.append({
                "position": f"Context [+{i - current_idx}]",  # Will be positive (e.g., +1, +2, +3)
                "label": elem.get("label", "unknown"),
                "text": elem.get("text", ""),
                "reading_order": elem.get("reading_order", -1)
            })
        
        return {
            "context_before": context_before,
            "current": {
                "position": "CURRENT (Main element to analyze)",
                "label": current_element.get("label", "unknown"),
                "text": current_element.get("text", ""),
                "reading_order": current_element.get("reading_order", -1),
                "bbox": current_element.get("bbox", [])
            },
            "context_after": context_after
        }
    
    def format_context_for_prompt(self, context_data: Dict[str, Any]) -> str:
        """Format the context data into a readable string for the prompt"""
        sections = []
        
        # Add context before
        if context_data["context_before"]:
            sections.append("=== CONTEXT BEFORE (for reference) ===")
            for ctx in context_data["context_before"]:
                sections.append(f"\n[{ctx['position']}] - {ctx['label'].upper()}")
                sections.append(f"{ctx['text'][:500]}")  # Limit length to avoid token overflow
        
        # Add current element (the main focus)
        sections.append("\n" + "="*60)
        sections.append("=== CURRENT ELEMENT (MAIN FOCUS - EXTRACT FROM THIS) ===")
        sections.append("="*60)
        current = context_data["current"]
        sections.append(f"[{current['position']}] - {current['label'].upper()}")
        sections.append(f"\n{current['text']}\n")
        
        # Add context after
        if context_data["context_after"]:
            sections.append("=== CONTEXT AFTER (for reference) ===")
            for ctx in context_data["context_after"]:
                sections.append(f"\n[{ctx['position']}] - {ctx['label'].upper()}")
                sections.append(f"{ctx['text'][:500]}")
        
        return "\n".join(sections)
    
    def create_extraction_prompt(self, context_text: str, element_type: str, page_num: int) -> ChatPromptTemplate:
        """Build the system/user chat prompt for one catalog element.

        The system message is assembled here with the philatelic glossary, the
        element label and the page number already filled in. The user message
        keeps a single ``{context_text}`` placeholder that the caller fills in
        when it formats the returned template.

        Args:
            context_text: Not used while building the template; kept so the
                signature stays unchanged (the caller supplies the text when
                formatting).
            element_type: Label of the element being analysed.
            page_num: Page number shown to the model.

        Returns:
            A ``ChatPromptTemplate`` holding one system and one user message.
        """
        def escape_braces(value: Any) -> str:
            # The finished system message is parsed a second time as an
            # f-string style template, so a literal brace coming from the data
            # would be read as a template variable. Doubling it keeps it literal.
            return str(value).replace("{", "{{").replace("}", "}}")

        glossary = escape_braces(self.philatelic_context)
        safe_element_type = escape_braces(element_type)
        safe_page_num = escape_braces(page_num)

        # NOTE(review): the JSON shape requested below does not name the fields
        # of the individual stamp/variety/proof/specimen objects.
        system_message = f"""You are an expert philatelist specializing in Costa Rican postal history and stamp catalogs.
        Your task is to extract structured information from catalog entries to build a knowledge graph.
        
        IMPORTANT: You must respond with valid JSON only. Do not use markdown formatting or code blocks.
        
        {glossary}
        
        IMPORTANT INSTRUCTIONS:
        - You will receive a CURRENT ELEMENT (the main text to analyze) surrounded by CONTEXT elements
        - Use the context elements to better understand the current element
        - Extract information ONLY from the CURRENT ELEMENT (marked as "MAIN FOCUS")
        - The context is provided to help you understand:
          * What issue or series is being discussed
          * Continuation of information from previous elements
          * Printer information or general notes that apply to multiple stamps
          * Section headers or organizational structure
        
        The current element type is: {safe_element_type}
        Page number: {safe_page_num}
        
        Extract all relevant philatelic information including:
        - Catalog numbers (Scott, Michel, Yvert)
        - Denominations and currencies
        - Colors and their variants
        - Descriptions and designs
        - Issue dates
        - Printers and printing methods
        - Perforation measurements
        - Quantities issued
        - Varieties, errors, proofs, and specimens
        - Any overprints or special markings
        
        Be precise with catalog numbers and maintain the exact format shown in the text.
        Use context to infer missing information (like printer, issue series) but only extract from the CURRENT element.
        """
        
        # Plain (non-f) string: {context_text} stays a template variable and the
        # doubled braces render as literal JSON braces.
        user_message = """Analyze the following catalog text WITH CONTEXT and extract all philatelic nodes.

        {context_text}
        
        Return your analysis as valid JSON with this exact structure:
        {{
            "stamps": [array of stamp objects with all available details],
            "varieties": [array of variety objects],
            "proofs": [array of proof objects],
            "specimens": [array of specimen objects],
            "notes": "any additional context or observations about the current element",
            "inferred_from_context": {{
                "issue_name": "if identifiable from context",
                "printer": "if mentioned in context",
                "issue_date": "if mentioned in context"
            }}
        }}
        
        REMEMBER: 
        1. Extract data from the CURRENT ELEMENT only, but use context to enrich and validate your extraction
        2. For each stamp, include all available details with proper field names
        3. Create separate entries for varieties, proofs, and specimens that reference their parent stamp
        4. Return valid JSON only - no markdown, no code blocks, no explanatory text
        """
        
        return ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("user", user_message)
        ])
    
    def process_catalog_entry(self, context_data: Dict[str, Any], page_num: int) -> Dict[str, List[Dict]]:
        """Process a single catalog entry with context and extract philatelic nodes"""
        current_element = context_data["current"]
        text = current_element.get("text", "")
        element_type = current_element.get("label", "unknown")
        
        # Skip if current element has no meaningful text
        if not text or len(text.strip()) < 10:
            return {"stamps": [], "varieties": [], "proofs": [], "specimens": []}
        
        # Format the context for the prompt
        context_text = self.format_context_for_prompt(context_data)
        
        prompt = self.create_extraction_prompt(context_text, element_type, page_num)
        
        try:
            response = self.llm.invoke(prompt.format(context_text=context_text))
            
            # Parse the JSON response (should be clean JSON now)
            result = json.loads(response.content)
            
            # Validate result structure
            expected_keys = {"stamps", "varieties", "proofs", "specimens"}
            if not isinstance(result, dict):
                print(f"    Warning: Response is not a dictionary on page {page_num}")
                return {"stamps": [], "varieties": [], "proofs": [], "specimens": []}
            
            # Ensure all expected keys exist
            for key in expected_keys:
                if key not in result:
                    result[key] = []
            
            # Add metadata about the extraction
            for stamp in result.get("stamps", []):
                if isinstance(stamp, dict):
                    stamp["_metadata"] = {
                        "page": page_num,
                        "element_type": element_type,
                        "reading_order": current_element.get("reading_order", -1)
                    }
            
            # Show extraction results for debugging
            extracted_count = len(result.get("stamps", [])) + len(result.get("varieties", [])) + len(result.get("proofs", [])) + len(result.get("specimens", []))
            if extracted_count > 0:
                print(f"    ✓ Extracted: {len(result.get('stamps', []))} stamps, {len(result.get('varieties', []))} varieties, {len(result.get('proofs', []))} proofs, {len(result.get('specimens', []))} specimens")
            
            return result
        
        except json.JSONDecodeError as e:
            print(f"    JSON parsing error on page {page_num}, element {element_type}: {e}")
            print(f"    Element text: {text[:100]}...")
            print(f"    Response content: {response.content[:300]}...")
            return {"stamps": [], "varieties": [], "proofs": [], "specimens": []}
        except Exception as e:
            print(f"    Error processing element on page {page_num}, element {element_type}: {e}")
            print(f"    Element text: {text[:100]}...")
            return {"stamps": [], "varieties": [], "proofs": [], "specimens": []}
    
    def process_json_file(self, json_path: str) -> Dict[str, List[Dict]]:
        """Process entire JSON catalog file with context windows"""
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # Validate JSON structure
        if "pages" not in data:
            raise ValueError(f"Invalid JSON structure: missing 'pages' key. Found keys: {list(data.keys())}")
        
        print(f"Loaded catalog with {len(data['pages'])} pages from {data.get('source_file', 'unknown source')}")
        
        all_stamps = []
        all_varieties = []
        all_proofs = []
        all_specimens = []
        
        # Process each page - Fixed: iterate over data["pages"] instead of data
        for page in data["pages"]:
            page_num = page.get("page_number", 0)
            print(f"\nProcessing page {page_num}...")
            
            elements = page.get("elements", [])
            
            # Sort elements by reading_order if available
            elements.sort(key=lambda x: x.get("reading_order", 999))
            
            # Process each element with context window
            for idx, element in enumerate(elements):
                element_type = element.get("label", "unknown")
                
                # Skip configured element types that typically don't contain stamp data
                if element_type in self.skip_elements:
                    print(f"  Skipping {element_type} at index {idx}")
                    continue
                
                print(f"  Processing element {idx+1}/{len(elements)} ({element_type})...")
                
                # Get element with context
                context_data = self.get_element_with_context(elements, idx)
                
                # Process with context
                result = self.process_catalog_entry(context_data, page_num)
                
                all_stamps.extend(result.get("stamps", []))
                all_varieties.extend(result.get("varieties", []))
                all_proofs.extend(result.get("proofs", []))
                all_specimens.extend(result.get("specimens", []))
                
                # Display any inferred context information
                if result.get("inferred_from_context"):
                    print(f"    → Inferred from context: {result['inferred_from_context']}")
        
        return {
            "stamps": all_stamps,
            "varieties": all_varieties,
            "proofs": all_proofs,
            "specimens": all_specimens
        }
    
    def inspect_json_structure(self, json_path: str, max_pages: int = 5) -> None:
        """Inspect the structure of the JSON file to understand its format"""
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        print(f"JSON Structure Inspection:")
        print(f"Root keys: {list(data.keys())}")
        print(f"Total pages: {data.get('total_pages', 'unknown')}")
        print(f"Source file: {data.get('source_file', 'unknown')}")
        
        if "pages" in data:
            print(f"Pages available: {len(data['pages'])}")
            
            # Show sample pages
            for i, page in enumerate(data["pages"][:max_pages]):
                page_num = page.get("page_number", i)
                elements = page.get("elements", [])
                print(f"\nPage {page_num}:")
                print(f"  Elements: {len(elements)}")
                
                # Show element types
                element_types = {}
                for elem in elements:
                    elem_type = elem.get("label", "unknown")
                    element_types[elem_type] = element_types.get(elem_type, 0) + 1
                
                print(f"  Element types: {dict(element_types)}")
                
                # Show sample text from first few elements
                for j, elem in enumerate(elements[:3]):
                    text = elem.get("text", "")[:100]
                    print(f"    Element {j+1} ({elem.get('label', 'unknown')}): {text}...")
    
    def process_sample_pages(self, json_path: str, start_page: int = 1, num_pages: int = 3) -> Dict[str, List[Dict]]:
        """Process only a sample of pages for testing purposes"""
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        if "pages" not in data:
            raise ValueError(f"Invalid JSON structure: missing 'pages' key")
        
        # Filter to sample pages
        sample_pages = [p for p in data["pages"] if start_page <= p.get("page_number", 0) < start_page + num_pages]
        
        print(f"Processing sample: {len(sample_pages)} pages (from page {start_page})")
        
        all_stamps = []
        all_varieties = []
        all_proofs = []
        all_specimens = []
        
        # Process sample pages
        for page in sample_pages:
            page_num = page.get("page_number", 0)
            print(f"\nProcessing sample page {page_num}...")
            
            elements = page.get("elements", [])
            elements.sort(key=lambda x: x.get("reading_order", 999))
            
            for idx, element in enumerate(elements):
                element_type = element.get("label", "unknown")
                
                if element_type in self.skip_elements:
                    print(f"  Skipping {element_type} at index {idx}")
                    continue
                
                print(f"  Processing element {idx+1}/{len(elements)} ({element_type})...")
                
                context_data = self.get_element_with_context(elements, idx)
                result = self.process_catalog_entry(context_data, page_num)
                
                all_stamps.extend(result.get("stamps", []))
                all_varieties.extend(result.get("varieties", []))
                all_proofs.extend(result.get("proofs", []))
                all_specimens.extend(result.get("specimens", []))
                
                if result.get("inferred_from_context"):
                    print(f"    → Inferred from context: {result['inferred_from_context']}")
        
        return {
            "stamps": all_stamps,
            "varieties": all_varieties,
            "proofs": all_proofs,
            "specimens": all_specimens
        }
    
    def save_knowledge_graph(self, data: Dict, output_path: str):
        """Save extracted knowledge graph to JSON"""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        
        print(f"\n{'='*60}")
        print(f"Knowledge graph saved to: {output_path}")
        print(f"{'='*60}")
        print(f"Total stamps: {len(data['stamps'])}")
        print(f"Total varieties: {len(data['varieties'])}")
        print(f"Total proofs: {len(data['proofs'])}")
        print(f"Total specimens: {len(data['specimens'])}")
        print(f"{'='*60}")


## Configuration

In [3]:
# Run settings
API_KEY = os.environ.get("OPENAI_API_KEY")  # taken from the environment; None when the variable is unset
INPUT_PATH = "./results/recognition_json/Mena 2018 CRPC .json"
OUTPUT_PATH = "./results/knowledge_graph/mena_2018_crpc_kg.json"

# (before, after) offsets: 3 elements before and 1 element after the current one
CONTEXT_WINDOW = (-3, 1)

# Announce the settings, then build the extractor
print("Initializing Philatelic Knowledge Graph Extractor")
print(f"Context window: {CONTEXT_WINDOW[0]} before, +{CONTEXT_WINDOW[1]} after")

extractor = PhilatelicExtractor(
    model="gpt-4o-mini",  # author's note: use gpt-5-nano for better accuracy
    context_window=CONTEXT_WINDOW,
    api_key=API_KEY,
)

Initializing Philatelic Knowledge Graph Extractor
Context window: -3 before, +1 after


## Test

In [4]:
# Smoke test: inspect the input file, then run the extractor on a small page sample
print("=== TESTING IMPROVED FUNCTIONALITY ===")

# Step 1 - show how the recognition JSON is laid out
print("1. Inspecting JSON structure...")
extractor.inspect_json_structure(INPUT_PATH, max_pages=3)

print("\n" + "=" * 60)
print("2. Processing sample pages for testing...")

# Step 2 - pages 30-31 are used to get catalog content with actual stamp data
sample_result = extractor.process_sample_pages(INPUT_PATH, start_page=30, num_pages=2)

# One "<kind> found: <count>" line per node kind
print("\nSample Results:")
print("\n".join(
    f"{title} found: {len(sample_result[kind])}"
    for title, kind in (
        ("Stamps", "stamps"),
        ("Varieties", "varieties"),
        ("Proofs", "proofs"),
        ("Specimens", "specimens"),
    )
))

# Show the first stamp and the first proof, when there are any
if sample_result["stamps"]:
    print("\nFirst stamp found:")
    print(json.dumps(sample_result["stamps"][0], indent=2))

if sample_result["proofs"]:
    print("\nFirst proof found:")
    print(json.dumps(sample_result["proofs"][0], indent=2))

print("\n" + "=" * 60)
print("3. Testing specific element types...")

# Step 3 - the run counts as successful when at least one node of any kind came back
if any(sample_result[kind] for kind in ("stamps", "varieties", "proofs", "specimens")):
    print("✓ JSON parsing is now working correctly!")
    print("✓ Elements are being extracted successfully!")
else:
    print("⚠ Still not extracting elements - may need context strategy improvements")

=== TESTING IMPROVED FUNCTIONALITY ===
1. Inspecting JSON structure...
JSON Structure Inspection:
Root keys: ['source_file', 'total_pages', 'pages']
Total pages: 315
Source file: pdfs\Catalogues\Mena 2018 CRPC .pdf
Pages available: 315

Page 1:
  Elements: 8
  Element types: {'header': 1, 'para': 6, 'fig': 1}
    Element 1 (header): SBN 0-9645247-8-3...
    Element 2 (para): Hector R. Mena...
    Element 3 (para): COSTA RICA...

Page 2:
  Elements: 7
  Element types: {'para': 6, 'fig': 1}
    Element 1 (para): Hector R. Mena...
    Element 2 (para): COSTA RICA...
    Element 3 (para): POSTAL CATALOGUE...

Page 3:
  Elements: 8
  Element types: {'para': 8}
    Element 1 (para): SOCORICO...
    Element 2 (para): Society for Costa Rica Collectors
Postal Box 14831...
    Element 3 (para): Baton Rouge, Louisiana 70808...

2. Processing sample pages for testing...
Processing sample: 2 pages (from page 30)

Processing sample page 30...
  Processing element 1/30 (header)...
  Processing elemen

In [5]:
# Notebook display: show the stamp records extracted from the sample pages
sample_result["stamps"]

[{'catalog_numbers': {'Scott': None, 'Michel': None, 'Yvert': None},
  'denomination': 'unknown',
  'currency': None,
  'colors': ['blue', 'black'],
  'description': 'tete beche vertical pairs',
  'issue_date': None,
  'printer': None,
  'perforation': 'double perf vertics',
  'quantities_issued': None,
  'overprints': None,
  '_metadata': {'page': 30, 'element_type': 'fig', 'reading_order': 7}},
 {'catalog_number': 'RFC 53',
  'issue_date': 'July 24, 1921',
  'printer': 'American Bank Note Company',
  'denomination': None,
  'colors': None,
  'description': 'Engraved',
  'perforation': 'Perf 12',
  'quantity_issued': None,
  'design': None,
  'notes': None,
  '_metadata': {'page': 30, 'element_type': 'para', 'reading_order': 9}},
 {'catalog_number': 'DP99',
  'denomination': '15 c',
  'color': 'black',
  'quantity_issued': '#34009',
  '_metadata': {'page': 30, 'element_type': 'para', 'reading_order': 11}},
 {'catalog_number': 'PP99',
  'denomination': '15c',
  'color': 'light gray vio

## Process Catalog

In [None]:
# Full run: extract nodes from every page of the catalog
print(f"\nProcessing catalog: {INPUT_PATH}")
knowledge_graph = extractor.process_json_file(INPUT_PATH)

# Persist the result
extractor.save_knowledge_graph(knowledge_graph, OUTPUT_PATH)

# Print one example stamp and one example proof, when available
print("\n" + "=" * 60)
print("SAMPLE EXTRACTED NODES:")
print("=" * 60)

if knowledge_graph["stamps"]:
    print("\nSample Stamp:")
    print(json.dumps(knowledge_graph["stamps"][0], indent=2))

if knowledge_graph["proofs"]:
    print("\nSample Proof:")
    print(json.dumps(knowledge_graph["proofs"][0], indent=2))