# Benchmarking / Metrics Calculation

**Task**: Compare parser JSONs against ground truth labels to compute precision, recall, and F1-score per field.

**Requirements**:
- Load ground truth labels
- Load parser JSONs
- Compare each field across documents (aligned with invoice_schema_v1_reset.json)
- Compute precision, recall, F1-score per field
- Identify fields with frequent misses
- Track template-level coverage (ensure all 3 templates are represented)
- Save results in versioned output (benchmark_v0.3.json or .csv)

**Schema Alignment**: This benchmark evaluates all fields from the invoice schema including required fields (invoice_number, due_date, patient_name, subtotal_amount, invoice_date, total_amount, line_items) and optional fields (patient_id, patient_age, patient_address, patient_phone, patient_email, admission_date, discharge_date, discount_amount, provider_name, bed_id).


## 1. Setup and Imports


In [None]:
import pandas as pd
import json
import numpy as np
from pathlib import Path
from typing import Dict, List, Any, Tuple
from datetime import datetime
import warnings
# Suppress every warning category for the rest of the session so cell output
# stays compact.
# NOTE(review): this also hides pandas/NumPy deprecation warnings; consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Confirmation that the setup cell ran to completion
print("Imports ran successful")


## 2. Configuration


In [None]:
# Benchmark settings shared by every cell below.
# Keys are declared in the same order as before so iteration order is unchanged.
CONFIG = dict(
    version="v0.3",
    ground_truth_csv="bench/data/ground_truth/invoice_fields.csv",
    # TODO: Update when available
    minna_parser_dir="path/to/minna/parser/jsons",
    output_dir="bench/outputs",
    # One generated invoice per template: Template 1, Template 2, Template 3
    test_documents=[
        "invoice_T1_gen1.pdf",
        "invoice_T2_gen1.pdf",
        "invoice_T3_gen1.pdf",
    ],
    fields_to_evaluate=[
        # Required fields from schema
        "invoice_number",
        "due_date",
        "patient_name",
        "subtotal_amount",
        "invoice_date",
        "total_amount",
        "line_items",
    ] + [
        # Optional fields from schema
        "patient_id",
        "patient_age",
        "patient_address",
        "patient_phone",
        "patient_email",
        "admission_date",
        "discharge_date",
        "discount_amount",
        "provider_name",
        "bed_id",
    ],
)

# Echo the active version and the number of fields that will be scored
print(f"Configuration loaded - Version: {CONFIG['version']}")
print(f"Fields to evaluate: {len(CONFIG['fields_to_evaluate'])} fields")


## 3. Load Ground Truth Labels (Dean & Matthew D.)


In [None]:
# Load ground truth CSV (one row per document, keyed by its 'filename' column)
gt_df = pd.read_csv(CONFIG['ground_truth_csv'])
print(f"Loaded ground truth for {len(gt_df)} documents")
print(f"Columns: {list(gt_df.columns)}")

# Convert to a {filename: {column: value}} dictionary for easy lookup
ground_truth = {}
for _, row in gt_df.iterrows():
    filename = row['filename']
    record = row.to_dict()

    # line_items is read from the CSV as a JSON string; decode it here.
    # An empty or unreadable cell is stored as None rather than left as a float
    # NaN: NaN is truthy and not iterable, so it would pass the "is it empty?"
    # checks downstream and then fail when the comparison code loops over it.
    if 'line_items' in record:
        raw_items = record['line_items']
        if isinstance(raw_items, str) and raw_items.strip():
            try:
                record['line_items'] = json.loads(raw_items)
            except json.JSONDecodeError as exc:
                # Surface malformed labels instead of silently discarding them
                print(f"WARNING: could not parse line_items for {filename}: {exc}")
                record['line_items'] = None
        else:
            record['line_items'] = None

    ground_truth[filename] = record

print(f"Ground truth loaded for documents: {list(ground_truth.keys())}")

# Display sample ground truth (guarded so an empty CSV does not raise IndexError)
if ground_truth:
    sample_doc = next(iter(ground_truth))
    print(f"\nSample ground truth for {sample_doc}:")
    for field in CONFIG['fields_to_evaluate']:
        if field in ground_truth[sample_doc]:
            print(f"  {field}: {ground_truth[sample_doc][field]}")
else:
    print("WARNING: ground truth CSV contains no rows")


## 4. Load Parser JSONs

**TODO**: Update the path and loading logic when parser JSONs are available


In [None]:
# TODO: Replace this section when Minna's parser JSONs are available
# Expected structure: JSON files with same naming as PDFs but .json extension

# Parser output keyed by document filename. This is the name the metrics cell
# reads, so every reference in this cell uses it as well (the cell previously
# wrote to an undefined `parser_results` and raised NameError).
minna_parser_results = {}

# Placeholder: Create dummy parser results aligned with schema
print("Using placeholder parser results - Replace with actual JSONs")

for doc_name in CONFIG['test_documents']:
    # TODO: Replace with actual loading logic:
    # json_path = Path(CONFIG['minna_parser_dir']) / f"{Path(doc_name).stem}.json"
    # with open(json_path, 'r') as f:
    #     minna_parser_results[doc_name] = json.load(f)

    # Placeholder data aligned with invoice_schema_v1_reset.json
    minna_parser_results[doc_name] = {
        # Required fields
        "invoice_number": "PLACEHOLDER_INV",
        "due_date": "2020-02-01",
        "patient_name": "John Doe",
        "subtotal_amount": 100.00,
        "invoice_date": "2020-01-01",
        "total_amount": 120.00,
        "line_items": [
            {
                "description": "Placeholder Service",
                "code": "PLACEHOLDER",
                "amount": 100.00
            }
        ],

        # Optional fields
        "patient_id": "PLACEHOLDER_MRN",
        "patient_age": 35,
        "patient_address": "123 Main St, City, State 12345",
        "patient_phone": "+1-555-0123",
        "patient_email": "john.doe@email.com",
        "admission_date": "2020-01-01",
        "discharge_date": "2020-01-02",
        "discount_amount": 0.00,
        "provider_name": "Dr. Jane Smith",
        "bed_id": "BED001"
    }

print(f"Parser results loaded for {len(minna_parser_results)} documents")
print(f"Documents: {list(minna_parser_results.keys())}")

# Display sample parser result (guarded in case no test documents are configured)
if minna_parser_results:
    sample_doc = next(iter(minna_parser_results))
    print(f"\nSample parser result for {sample_doc}:")
    for field in CONFIG['fields_to_evaluate']:
        if field in minna_parser_results[sample_doc]:
            print(f"  {field}: {minna_parser_results[sample_doc][field]}")


## 5. Field Comparison Functions


In [None]:
def normalize_value(value):
    """Normalize a value for case- and whitespace-insensitive comparison.

    Args:
        value: A field value taken from the ground truth or the parser output.

    Returns:
        None when ``value`` is None or a missing-value scalar (e.g. NaN);
        otherwise the stripped, lower-cased string form of ``value``.
    """
    if value is None:
        return None
    if isinstance(value, str):
        return value.strip().lower()
    # pd.isna() answers element-wise for containers, and the truth value of the
    # resulting array is ambiguous, so containers skip the missing-value test.
    if isinstance(value, (list, tuple, dict, set)):
        return str(value).strip().lower()
    if pd.isna(value):
        return None
    return str(value).strip().lower()

# Fields compared as numbers (with a small tolerance) when the text differs
_NUMERIC_FIELDS = frozenset(['subtotal_amount', 'total_amount', 'discount_amount', 'patient_age'])

# Fields compared as calendar dates when the text differs
_DATE_FIELDS = frozenset(['invoice_date', 'due_date', 'admission_date', 'discharge_date'])

# Date layouts tried in order; year-first layouts come first.
# NOTE(review): day-first layouts (dd/mm/yyyy) are not attempted because they
# are indistinguishable from month-first ones; add them only if the data needs it.
_DATE_FORMATS = ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%m-%d-%Y")


def _parse_number(text):
    """Return ``text`` as a float after dropping '$' and ',', or None if it is not numeric."""
    try:
        return float(str(text).replace('$', '').replace(',', ''))
    except ValueError:
        return None


def _parse_date(text):
    """Return ``text`` as a ``date`` using the first matching layout, or None."""
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(str(text), fmt).date()
        except ValueError:
            continue
    return None


def compare_scalar_field(gt_value, parser_value, field_name):
    """Compare one scalar field (invoice_number, patient_id, etc.).

    Args:
        gt_value: Ground truth value for the field.
        parser_value: Value produced by the parser for the same field.
        field_name: Schema field name; selects numeric or date handling.

    Returns:
        A ``(is_match, match_type)`` tuple. ``match_type`` is one of
        "both_missing", "gt_missing", "parser_missing", "exact_match",
        "numeric_match", "date_match" or "mismatch".
    """
    gt_norm = normalize_value(gt_value)
    parser_norm = normalize_value(parser_value)

    # Handle missing values
    if gt_norm is None and parser_norm is None:
        return True, "both_missing"
    if gt_norm is None:
        return False, "gt_missing"
    if parser_norm is None:
        return False, "parser_missing"

    # Exact match on the normalized text
    if gt_norm == parser_norm:
        return True, "exact_match"

    # Numeric fields: "100", "100.0" and "$100.00" should all agree
    if field_name in _NUMERIC_FIELDS:
        gt_num = _parse_number(gt_norm)
        parser_num = _parse_number(parser_norm)
        if gt_num is not None and parser_num is not None:
            # Allow small tolerance for floating point
            if abs(gt_num - parser_num) < 0.01:
                return True, "numeric_match"

    # Date fields: compare parsed calendar dates. Merely stripping separators
    # makes different dates collide ("2020-1-11" and "2020-11-1" both become
    # "2020111") and never matches equal dates written in different layouts.
    if field_name in _DATE_FIELDS:
        gt_date = _parse_date(gt_norm)
        parser_date = _parse_date(parser_norm)
        if gt_date is not None and parser_date is not None:
            if gt_date == parser_date:
                return True, "date_match"
            return False, "mismatch"

        # At least one side is in an unrecognised layout: fall back to the
        # separator-insensitive text comparison.
        gt_compact = gt_norm.replace('-', '').replace('/', '')
        parser_compact = parser_norm.replace('-', '').replace('/', '')
        if gt_compact == parser_compact:
            return True, "date_match"

    return False, "mismatch"

def _coerce_item_list(items):
    """Return ``items`` as a list; anything that is not a list/tuple (None, NaN, ...) counts as empty."""
    return list(items) if isinstance(items, (list, tuple)) else []


def _normalize_line_item(item):
    """Reduce one line-item dict to normalized code/description text and a float amount."""
    raw_amount = item.get('amount', '')
    try:
        amount = float(str(raw_amount).replace('$', '').replace(',', ''))
    except (ValueError, TypeError):
        # Unparseable or absent amounts are treated as 0.0
        amount = 0.0

    return {
        'code': normalize_value(item.get('code', '')),
        'description': normalize_value(item.get('description', '')),
        'amount': amount,
    }


def compare_line_items(gt_items, parser_items):
    """Compare two line-item arrays irrespective of item order.

    Args:
        gt_items: Ground truth line items (list of dicts). None, NaN or any
            other non-list value is treated as "no items".
        parser_items: Parser line items, same conventions as ``gt_items``.

    Returns:
        A ``(is_match, match_type)`` tuple. ``match_type`` is "both_empty",
        "gt_empty", "parser_empty", "exact_match", "partial_match_<k>/<n>" or
        "length_mismatch_<n>_vs_<m>".
    """
    gt_items = _coerce_item_list(gt_items)
    parser_items = _coerce_item_list(parser_items)

    if not gt_items and not parser_items:
        return True, "both_empty"
    if not gt_items:
        return False, "gt_empty"
    if not parser_items:
        return False, "parser_empty"

    # Convert to comparable format; entries that are not dicts are skipped
    gt_normalized = [_normalize_line_item(item) for item in gt_items if isinstance(item, dict)]
    parser_normalized = [_normalize_line_item(item) for item in parser_items if isinstance(item, dict)]

    if len(gt_normalized) != len(parser_normalized):
        return False, f"length_mismatch_{len(gt_normalized)}_vs_{len(parser_normalized)}"

    # Pair items one-to-one: each parser item may satisfy at most one ground
    # truth item, otherwise [A, A] would be reported as matching [A, B].
    unmatched = list(parser_normalized)
    matches = 0
    for gt_item in gt_normalized:
        for idx, parser_item in enumerate(unmatched):
            desc_match = gt_item['description'] == parser_item['description']
            # Amounts are compared with a one-cent tolerance
            amount_match = abs(gt_item['amount'] - parser_item['amount']) < 0.01
            code_match = gt_item['code'] == parser_item['code']

            if desc_match and amount_match and code_match:
                matches += 1
                del unmatched[idx]
                break

    if matches == len(gt_normalized):
        return True, "exact_match"
    return False, f"partial_match_{matches}/{len(gt_normalized)}"

print("Field comparison functions defined")


## 6. Compute Precision, Recall, F1-Score per Field


In [None]:
def _has_value(value):
    """Return True when ``value`` carries real content (not None/NaN/blank/empty container)."""
    if value is None:
        return False
    # Containers (e.g. line_items lists) are judged by length; pd.notna() would
    # answer element-wise and the resulting array has no single truth value.
    if isinstance(value, (list, tuple, dict, set)):
        return len(value) > 0
    if pd.isna(value):
        return False
    return str(value).strip() != ''


def compute_field_metrics(field_name, ground_truth, parser_results, documents):
    """Compute precision, recall, F1-score for a specific field.

    Args:
        field_name: Schema field to evaluate.
        ground_truth: Mapping of document name -> dict of ground truth values.
        parser_results: Mapping of document name -> dict of parser values.
        documents: Document names to evaluate.

    Returns:
        A dict with the field name, presence counters ('gt_present',
        'parser_present', 'both_present'), 'matches' (documents where both
        sides have a value and the values agree), 'precision', 'recall',
        'f1_score', per-document 'document_results', and 'skipped_documents'
        (documents absent from either mapping).
    """
    results = {
        'field': field_name,
        'total_documents': len(documents),
        'gt_present': 0,
        'parser_present': 0,
        'both_present': 0,
        'matches': 0,
        'document_results': [],
        'skipped_documents': []
    }

    for doc_name in documents:
        if doc_name not in ground_truth or doc_name not in parser_results:
            # Record the gap instead of dropping the document silently
            results['skipped_documents'].append(doc_name)
            continue

        gt_value = ground_truth[doc_name].get(field_name)
        parser_value = parser_results[doc_name].get(field_name)

        # Check presence
        gt_has_value = _has_value(gt_value)
        parser_has_value = _has_value(parser_value)

        if gt_has_value:
            results['gt_present'] += 1
        if parser_has_value:
            results['parser_present'] += 1
        if gt_has_value and parser_has_value:
            results['both_present'] += 1

        # Compare values
        if field_name == 'line_items':
            is_match, match_type = compare_line_items(gt_value, parser_value)
        else:
            is_match, match_type = compare_scalar_field(gt_value, parser_value, field_name)

        # Only agreement on a value that both sides actually have is a true
        # positive. Counting "both missing" as a match would let matches exceed
        # gt_present/parser_present and push precision/recall above 1.0.
        if is_match and gt_has_value and parser_has_value:
            results['matches'] += 1

        results['document_results'].append({
            'document': doc_name,
            'gt_value': gt_value,
            'parser_value': parser_value,
            'gt_present': gt_has_value,
            'parser_present': parser_has_value,
            'match': is_match,
            'match_type': match_type
        })

    # Calculate metrics; a ratio with a zero denominator is reported as 0.0
    if results['parser_present'] > 0:
        results['precision'] = results['matches'] / results['parser_present']
    else:
        results['precision'] = 0.0

    if results['gt_present'] > 0:
        results['recall'] = results['matches'] / results['gt_present']
    else:
        results['recall'] = 0.0

    if results['precision'] + results['recall'] > 0:
        results['f1_score'] = 2 * (results['precision'] * results['recall']) / (results['precision'] + results['recall'])
    else:
        results['f1_score'] = 0.0

    return results

# Score every configured field over the benchmark document set,
# collecting the per-field result dicts under the field's name.
field_metrics = {}

for field in CONFIG['fields_to_evaluate']:
    print(f"Computing metrics for field: {field}")
    field_metrics[field] = compute_field_metrics(
        field_name=field,
        ground_truth=ground_truth,
        parser_results=minna_parser_results,
        documents=CONFIG['test_documents'],
    )

print("Field metrics computed for all fields")


## 7. Display Results Summary


In [None]:
# Create summary DataFrame (one row per evaluated field; scores are formatted
# to three decimals for display)
summary_data = []
for field, metrics in field_metrics.items():
    summary_data.append({
        'Field': field,
        'Precision': f"{metrics['precision']:.3f}",
        'Recall': f"{metrics['recall']:.3f}",
        'F1-Score': f"{metrics['f1_score']:.3f}",
        'GT Present': metrics['gt_present'],
        'Parser Present': metrics['parser_present'],
        'Matches': metrics['matches'],
        'Total Docs': metrics['total_documents']
    })

summary_df = pd.DataFrame(summary_data)
print("FIELD-LEVEL METRICS SUMMARY")
print("=" * 80)
print(summary_df.to_string(index=False))

# Identify fields with frequent misses.
# Every field is listed, tagged with a severity so the problem fields stand
# out (the three branches previously printed the identical line).
print("\nFIELDS WITH FREQUENT MISSES:")
print("=" * 40)
for field, metrics in field_metrics.items():
    if metrics['f1_score'] < 0.5:  # Threshold for "frequent misses"
        status = "FREQUENT MISSES"
    elif metrics['f1_score'] < 0.8:
        status = "MODERATE"
    else:
        status = "GOOD"
    print(f"[{status}] {field}: F1={metrics['f1_score']:.3f} (Precision={metrics['precision']:.3f}, Recall={metrics['recall']:.3f})")


## 8. Track Template-Level Coverage


In [None]:
def _template_field_f1(doc_results):
    """Return the F1-score of one field over ``doc_results``, or None if nothing was evaluable.

    ``doc_results`` are entries of a field's 'document_results' list. A field
    that neither the ground truth nor the parser provides in any of these
    documents yields None so it does not drag the template average down.
    """
    actual = sum(1 for r in doc_results if r['gt_present'])
    predicted = sum(1 for r in doc_results if r['parser_present'])
    if not actual and not predicted:
        return None

    true_positives = sum(
        1 for r in doc_results
        if r['match'] and r['gt_present'] and r['parser_present']
    )
    precision = true_positives / predicted if predicted else 0.0
    recall = true_positives / actual if actual else 0.0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


# Analyze template coverage
template_coverage = {
    'T1': {'documents': [], 'avg_f1': 0.0},
    'T2': {'documents': [], 'avg_f1': 0.0},
    'T3': {'documents': [], 'avg_f1': 0.0}
}

# Assign each test document to its template
for doc_name in CONFIG['test_documents']:
    # Extract template from filename (e.g., invoice_T1_gen1.pdf -> T1)
    template = doc_name.split('_')[1] if '_' in doc_name else 'Unknown'

    if template in template_coverage:
        template_coverage[template]['documents'].append(doc_name)

# Score each template from the document-level results of its own documents.
# Using the global field-level F1 here would give every template the same
# number, and assigning inside the document loop would let the last document
# of a template overwrite the others.
for template, data in template_coverage.items():
    template_docs = set(data['documents'])
    field_f1_scores = []
    for metrics in field_metrics.values():
        doc_results = [r for r in metrics['document_results'] if r['document'] in template_docs]
        field_f1 = _template_field_f1(doc_results)
        if field_f1 is not None:
            field_f1_scores.append(field_f1)

    if field_f1_scores:
        data['avg_f1'] = float(np.mean(field_f1_scores))

print("TEMPLATE-LEVEL COVERAGE")
print("=" * 50)
for template, data in template_coverage.items():
    print(f"\n🔹 Template {template}:")
    print(f"   Documents: {data['documents']}")
    print(f"   Average F1: {data['avg_f1']:.3f}")

    if len(data['documents']) == 0:
        print(f"   WARNING: No documents found for template {template}")
    elif data['avg_f1'] < 0.5:
        print(f"   Poor performance on template {template}")
    elif data['avg_f1'] < 0.8:
        print(f"   Moderate performance on template {template}")
    else:
        print(f"Good performance on template {template}")

# Verify all 3 templates are represented
missing_templates = [t for t, data in template_coverage.items() if len(data['documents']) == 0]
if missing_templates:
    print(f"\nMISSING TEMPLATES: {missing_templates}")
    print("   Please ensure all 3 templates (T1, T2, T3) are represented in test documents")
else:
    print(f"\nAll 3 templates are represented in the benchmark")


## 9. Save Results in Versioned Output


In [None]:
# Assemble the full results payload section by section
# (key order: metadata, field_metrics, template_coverage, summary)
benchmark_results = {}
benchmark_results['metadata'] = {
    'version': CONFIG['version'],
    'timestamp': datetime.now().isoformat(),
    'test_documents': CONFIG['test_documents'],
    'fields_evaluated': CONFIG['fields_to_evaluate'],
    'total_documents': len(CONFIG['test_documents'])
}
benchmark_results['field_metrics'] = field_metrics
benchmark_results['template_coverage'] = template_coverage

# Unweighted averages across all evaluated fields, plus the low-F1 field list
benchmark_results['summary'] = {
    'overall_avg_precision': np.mean([entry['precision'] for entry in field_metrics.values()]),
    'overall_avg_recall': np.mean([entry['recall'] for entry in field_metrics.values()]),
    'overall_avg_f1': np.mean([entry['f1_score'] for entry in field_metrics.values()]),
    'fields_with_frequent_misses': [
        name for name, entry in field_metrics.items()
        if entry['f1_score'] < 0.5
    ]
}

# Make sure the destination folder exists before writing anything
output_dir = Path(CONFIG['output_dir'])
output_dir.mkdir(parents=True, exist_ok=True)

# 1) Full results as JSON; values json cannot encode are written via str()
json_filename = f"benchmark_{CONFIG['version']}.json"
json_path = output_dir / json_filename

with json_path.open('w') as f:
    json.dump(benchmark_results, f, indent=2, default=str)

print(f"JSON results saved to: {json_path}")

# 2) Field-level summary table as CSV
csv_filename = f"benchmark_{CONFIG['version']}_summary.csv"
csv_path = output_dir / csv_filename

summary_df.to_csv(csv_path, index=False)
print(f"CSV summary saved to: {csv_path}")

# 3) One CSV row per (field, document) pair, carrying the field's overall scores
detailed_data = [
    {
        'field': field,
        'document': doc_result['document'],
        'gt_value': doc_result['gt_value'],
        'parser_value': doc_result['parser_value'],
        'match': doc_result['match'],
        'match_type': doc_result['match_type'],
        'field_precision': metrics['precision'],
        'field_recall': metrics['recall'],
        'field_f1': metrics['f1_score']
    }
    for field, metrics in field_metrics.items()
    for doc_result in metrics['document_results']
]

detailed_df = pd.DataFrame(detailed_data)
detailed_filename = f"benchmark_{CONFIG['version']}_detailed.csv"
detailed_path = output_dir / detailed_filename
detailed_df.to_csv(detailed_path, index=False)

print(f"Detailed CSV saved to: {detailed_path}")

# Closing recap of the headline numbers
print(f"\nBenchmark results saved successfully!")
print(f"Overall Performance:")
print(f"   Average Precision: {benchmark_results['summary']['overall_avg_precision']:.3f}")
print(f"   Average Recall: {benchmark_results['summary']['overall_avg_recall']:.3f}")
print(f"   Average F1-Score: {benchmark_results['summary']['overall_avg_f1']:.3f}")


## 10. Next Steps

**TODO Tasks for when parser JSONs are available:**

1. **Update Section 4**: Replace the placeholder parser loading logic with actual JSON file loading
2. **Update CONFIG**: Set the correct path to `minna_parser_dir`
3. **Verify JSON format**: Ensure the parser JSONs match the expected field structure
4. **Re-run benchmark**: Execute all cells to get real metrics

**Current Status:**
- Ground truth loading implemented
- Field comparison functions implemented  
- Precision/Recall/F1 calculation implemented
- Template coverage tracking implemented
- Versioned output saving implemented
- Waiting for parser JSONs to complete the benchmark
