## Dolphin PDF Parser Philatelic RAG Ready

In [None]:
from IPython.display import Markdown, display, Image as IPImage
from pathlib import Path
import re
import os
import torch

from demo_page import *

# Import the refactored modules
from philatelic_patterns import  *

from philatelic_metadata_tests import *

# Import the new dolphin transformer module
from dolphin_transformer import transform_dolphin_to_oxcart_preserving_labels

from typing import Dict, Any, Optional, List
import json
from datetime import datetime

### Utility Functions

In [None]:
def show_markdown(ox: dict, figures_dir: Optional[Path] = None, width: int = 400):
    """Render a document's markdown in the notebook, displaying figures inline.

    The markdown is split on ``![Figure](path)`` references. Text between the
    references is rendered with ``IPython.display.Markdown``; every reference
    is resolved against ``figures_dir`` and rendered with
    ``IPython.display.Image``. A message is printed for figures that are not
    found on disk.

    Args:
        ox: Document dict; only its ``"markdown"`` entry is read. A missing,
            ``None`` or blank entry prints a notice and returns.
        figures_dir: Directory holding the figure files. Defaults to
            ``<cwd>/results/markdown/figures``.
        width: Display width passed to ``IPython.display.Image``.

    Returns:
        None. All output goes to the notebook display / stdout.
    """
    # `or ""` also covers an explicit None value, which .get() would pass through
    md = (ox.get("markdown") or "").strip()
    if not md:
        print("No hay markdown.")
        return

    if figures_dir is None:
        figures_dir = Path(os.getcwd()) / "results" / "markdown" / "figures"
    else:
        figures_dir = Path(figures_dir)

    figure_ref = re.compile(r'!\[Figure\]\(([^)]+)\)')
    prefix = "figures/"

    # Splitting on a capturing group keeps the figure references in the
    # result, interleaved with the text that surrounds them.
    for part in re.split(r'(!\[Figure\]\([^)]+\))', md):
        figure = figure_ref.fullmatch(part)
        if figure is None:
            # Plain text (including malformed figure references, which are
            # shown as-is instead of being dropped)
            if part.strip():
                display(Markdown(part))
            continue

        img_path = figure.group(1)
        # References may already carry the "figures/" folder name; strip it
        # so it is not duplicated under figures_dir.
        if img_path.startswith(prefix):
            img_path = img_path[len(prefix):]
        abs_path = figures_dir / img_path

        if abs_path.exists():
            display(IPImage(filename=str(abs_path), width=width))
        else:
            print(f"❌ Imagen no encontrada: {abs_path}")

def show_markdown_with_embedded_images(ox: dict, figures_dir: Optional[Path] = None):
    """Render a document's markdown with its figures embedded as base64 data URIs.

    Alternative to ``show_markdown``: every ``![Figure](path)`` reference is
    replaced by a ``data:image/...;base64,...`` URI built from the file found
    under ``figures_dir``, and the whole text is rendered in a single
    ``IPython.display.Markdown`` call. References whose file is missing or
    unreadable are replaced by a placeholder image tag with an explanatory
    alt text.

    Args:
        ox: Document dict; only its ``"markdown"`` entry is read. A missing,
            ``None`` or blank entry prints a notice and returns.
        figures_dir: Directory holding the figure files. Defaults to
            ``<cwd>/results/markdown/figures``.

    Returns:
        None. Output goes to the notebook display / stdout.
    """
    import base64

    # `or ""` also covers an explicit None value, which .get() would pass through
    md = (ox.get("markdown") or "").strip()
    if not md:
        print("No hay markdown.")
        return

    if figures_dir is None:
        figures_dir = Path(os.getcwd()) / "results" / "markdown" / "figures"
    else:
        figures_dir = Path(figures_dir)

    prefix = "figures/"
    # File extensions whose MIME subtype differs from the extension itself
    mime_subtypes = {"jpg": "jpeg", "svg": "svg+xml"}

    def embed_image(match):
        """Return the data-URI image tag (or a placeholder) for one figure reference."""
        img_path = match.group(1)
        # References may already carry the "figures/" folder name
        relative = img_path[len(prefix):] if img_path.startswith(prefix) else img_path
        abs_path = figures_dir / relative

        if not abs_path.exists():
            return f"![Imagen no encontrada]({img_path})"

        try:
            raw = abs_path.read_bytes()
        except OSError as e:
            # Only file-access failures are expected here; keep rendering the
            # rest of the document and surface the error in the alt text.
            return f"![Imagen error: {e}]({img_path})"

        img_ext = abs_path.suffix.lower().replace('.', '')
        img_ext = mime_subtypes.get(img_ext, img_ext)
        img_data = base64.b64encode(raw).decode()
        return f"![Figure](data:image/{img_ext};base64,{img_data})"

    # Replace image paths with base64 embedded images
    md_with_embedded = re.sub(r'!\[Figure\]\(([^)]+)\)', embed_image, md)

    display(Markdown(md_with_embedded))

### First Step --  Get the PDF Parsed from Dolphin

In [None]:
# Report the CUDA environment that PyTorch sees.
cuda_available = torch.cuda.is_available()
print(cuda_available)      # should print True on a GPU machine
print(torch.version.cuda)  # e.g. '12.1'
if cuda_available:
    print(torch.cuda.get_device_name(0))  # e.g. 'NVIDIA GeForce RTX 3060 Laptop GPU'
else:
    # Querying the device name without a CUDA device raises, which would
    # abort the cell; report the situation instead.
    print("No CUDA device available.")

In [None]:
# Base name (without extension) of the PDF to process; the cells below reuse it
# to build the input path, the result paths and the document id.
pdf_file_name = "OXCART30"

In [None]:

# Settings equivalent to the Dolphin command-line options
config_path = "./config/Dolphin.yaml"
input_path = f"./pdfs/{pdf_file_name}.pdf"
save_dir = "./results"
max_batch_size = 4  # default value

# Fail fast on a wrong file name, before the model is loaded
if not Path(input_path).is_file():
    raise FileNotFoundError(f"PDF not found: {input_path}")

# 1. Load the config and build the model
config = OmegaConf.load(config_path)
#config.model.swin_args.img_size = [768, 768]
model = DOLPHIN(config)

# Optional workaround (disabled): if the DOLPHIN class does not expose .to(),
# force FP16 + no_grad from outside by wrapping its .chat method.
# orig_chat = model.chat
# def chat_fp16(*args, **kwargs):
#     with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float16):
#         return orig_chat(*args, **kwargs)
# model.chat = chat_fp16  # monkey-patch

# 2. Prepare the output folders
setup_output_dirs(save_dir)

# 3. Process the PDF document
json_path, recognition_results = process_document(
    document_path=input_path,
    model=model,
    save_dir=save_dir,
    max_batch_size=max_batch_size
)

# Each entry of recognition_results carries an 'elements' list; report the overall count
total_elements = sum(len(page['elements']) for page in recognition_results)
print(f"✅ Resultados guardados en: {json_path}")
print(f"📄 Total elementos: {total_elements}")

### View the Original Dolphin-Parsed PDF as Markdown

In [None]:
# Markdown file to open for the current document (edit the path to view a different one)
md_file = Path(f"./results/markdown/{pdf_file_name}.md")

# Load its contents as UTF-8 text
with md_file.open(encoding="utf-8") as md_handle:
    md_text = md_handle.read()

# Render the text as markdown in the notebook output
display(Markdown(md_text))

### Getting the Chunks of the PDF -- Philatelic RAG Ready Chunks

In [None]:
# Convert the Dolphin recognition output into the `ox` document dict that the
# following cells enrich, save and render.
ox = transform_dolphin_to_oxcart_preserving_labels(
    recognition_results,
    doc_id=pdf_file_name,
    # Optional page-size lookup, currently disabled:
    # page_dims_provider=lambda p: Image.open(f"./results/pages/page_{p:03d}.png").size,
    para_max_chars=1000,  # presumably the maximum characters per paragraph chunk — confirm in dolphin_transformer
    table_row_block_size=None  # Disabled to preserve table integrity and quality
)

In [None]:
# Enrich the document with philatelic metadata; `ox` is rebound to the helper's
# return value (the helper is not imported by name, so it presumably comes from
# one of the star imports at the top of the notebook).
ox = enrich_all_chunks_advanced_philatelic(ox)

# Persist the enriched document as JSON, named after the source PDF.
# NOTE(review): whether save_json creates ./results/parsed_jsons when it is
# missing is not visible here — confirm.
save_json(ox, f"./results/parsed_jsons/{pdf_file_name}_philatelic.json")

In [None]:
#ox['chunks'][38]

In [None]:
# Show the markdown of the PDF processed with the philatelic logic for RAG,
# with figures embedded as base64 so the output is self-contained
show_markdown_with_embedded_images(ox)

#### Verificación de Entidades de Filatelia

Verificar que los chunks fueron enriquecidos correctamente con entidades filatélicas

In [None]:
# Inspect the philatelic entities attached to the chunks; the helper's return
# value is kept in `philatelic_chunks` for later inspection.
# max_examples=8 presumably caps how many examples are shown — confirm in the helper.
philatelic_chunks = show_philatelic_entities(ox, max_examples=8)

In [None]:
# Run the detailed entity analysis on the enriched document; the return value
# (if any) is not kept, so the helper presumably reports by printing — confirm.
analyze_philatelic_entities(ox)

In [None]:
# Show catalog examples for the enriched document, presumably grouped by
# catalog system as the helper name suggests — confirm; the return value is
# kept in `catalog_examples`.
catalog_examples = show_catalog_examples_by_system(ox)

#### Comparison: Original Dolphin vs Philatelic Enhanced

In [None]:
# Import the quality control system
from dolphin_quality_control import DolphinQualityControl

In [None]:
# Initialize quality control
qc = DolphinQualityControl()

# Reload the recognition JSON saved for this document.
# NOTE(review): `original_data` is loaded but not used by the comparison
# below, which receives the in-memory `recognition_results` instead — confirm
# which of the two is intended.
original_json_path = Path(f"./results/recognition_json/{pdf_file_name}.json")
with open(original_json_path, 'r', encoding='utf-8') as f:
    original_data = json.load(f)

# Compare original vs philatelic versions
comparison_results = qc.compare_versions_generic(
    original_data=recognition_results,
    philatelic_data=ox,
    doc_id=pdf_file_name
)

# Bind the nested sections once instead of re-indexing them in every print
philatelic_stats = comparison_results['philatelic']
oversized_chunks = philatelic_stats['oversized_chunks']

print(f"📊 Comparison Results for {pdf_file_name}:")
print(f"Original elements: {comparison_results['original']['total_elements']}")
print(f"Philatelic chunks: {philatelic_stats['total_chunks']}")
print(f"Oversized chunks: {len(oversized_chunks)}")

# Show improvement summary
summary = comparison_results['comparison_summary']
length_stats = summary['text_length_stats']
print("\n📈 Quality Metrics:")
print(f"Element to chunk ratio: {summary['element_to_chunk_ratio']:.2f}")
print(f"Average text length: Original={length_stats['original_avg']:.0f}, Philatelic={length_stats['philatelic_avg']:.0f}")
print(f"Max text length: Original={length_stats['original_max']}, Philatelic={length_stats['philatelic_max']}")

# Show any oversized chunks
if oversized_chunks:
    print("\n⚠️ Oversized chunks detected:")
    for chunk in oversized_chunks[:5]:  # Show first 5
        print(f"  - {chunk['chunk_id']} ({chunk['chunk_type']}): {chunk['text_length']} chars")
else:
    print("\n✅ No oversized chunks detected")

In [None]:
# Generate the detailed comparison report from the same inputs used for the
# comparison above (in-memory recognition results vs. the enriched document)
report = qc.generate_detailed_report_generic(
    original_data=recognition_results,
    philatelic_data=ox,
    doc_id=pdf_file_name
)

# Display the report; it is rendered as markdown, so it is presumably a
# markdown string — confirm in dolphin_quality_control.
# (Markdown is already imported in the first cell; re-importing here keeps
# this cell runnable on its own.)
from IPython.display import Markdown
display(Markdown(report))