# Invoice Validation System

## Data overview

Quick stats for `ground_truth.json`, `ocr_results.json`, and `database.json`.
This notebook expects all three JSON files in `./data` (checked first) or in the repo root.


In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

def find_repo_root():
    """Guess the repository root from the current working directory.

    The working directory is checked first, then its parent. A directory
    counts as the root when it contains a ``data`` entry or a
    ``requirements.txt`` entry.

    Returns:
        The first matching directory as a ``Path``; the working directory
        itself when neither candidate matches.
    """
    here = Path.cwd()
    markers = ('data', 'requirements.txt')
    for base in (here, here.parent):
        if any((base / marker).exists() for marker in markers):
            return base
    # No marker found nearby: fall back to wherever the notebook is running.
    return here

# Resolve the project root once, then list the directories that may hold the
# JSON inputs: the root's ``data`` folder followed by the root itself.
ROOT_DIR = find_repo_root()
DATA_DIR_CANDIDATES = [ROOT_DIR.joinpath('data'), ROOT_DIR]

def load_json(name, search_dirs=None):
    """Load the first JSON file called ``name`` found in the search directories.

    Args:
        name: File name to look for, e.g. ``'ground_truth.json'``.
        search_dirs: Optional iterable of directories to probe, in order.
            When omitted, the notebook-level ``DATA_DIR_CANDIDATES`` is used.

    Returns:
        A ``(data, path)`` tuple: the parsed JSON content and the ``Path``
        the content was read from.

    Raises:
        FileNotFoundError: If none of the search directories contains a
            regular file called ``name``.
    """
    if search_dirs is None:
        search_dirs = DATA_DIR_CANDIDATES
    for base in search_dirs:
        path = Path(base) / name
        # is_file() rather than exists(): a directory that happens to share
        # the name must not shadow a real file later in the search order.
        if path.is_file():
            return json.loads(path.read_text(encoding='utf-8')), path
    raise FileNotFoundError(f"Missing {name}. Put it in ./data or repo root.")

# Read the three inputs in a fixed order; every load_json() call yields the
# parsed content together with the path it was read from.
(ground_truth, gt_path), (ocr_results, ocr_path), (database, db_path) = [
    load_json(file_name)
    for file_name in ('ground_truth.json', 'ocr_results.json', 'database.json')
]

# Report which files were actually picked up.
print(' '.join(['Loaded:', *map(str, (gt_path, ocr_path, db_path))]))


Loaded: c:\Users\minhs\Desktop\invoice-validation-system\invoice-validation-system\data\ground_truth.json c:\Users\minhs\Desktop\invoice-validation-system\invoice-validation-system\data\ocr_results.json c:\Users\minhs\Desktop\invoice-validation-system\invoice-validation-system\data\database.json


In [2]:
# Ground-truth invoices sit under the top-level 'invoices' key; entries whose
# 'invoice_id' is absent or empty are left out of the id list.
gt_invoices = ground_truth.get('invoices', [])
gt_ids = [
    invoice_id
    for invoice_id in (inv.get('invoice_id') for inv in gt_invoices)
    if invoice_id
]
ocr_ids = list(ocr_results)

# Build each id set once and reuse it for all three overlap counts.
gt_id_set, ocr_id_set = set(gt_ids), set(ocr_ids)

summary = dict(
    gt_invoices=len(gt_ids),
    ocr_invoices=len(ocr_ids),
    in_both=len(gt_id_set & ocr_id_set),
    missing_in_ocr=len(gt_id_set - ocr_id_set),
    missing_in_gt=len(ocr_id_set - gt_id_set),
)
pd.DataFrame([summary])


Unnamed: 0,gt_invoices,ocr_invoices,in_both,missing_in_ocr,missing_in_gt
0,3,3,3,0,0


In [3]:
def collect_fields(gt_invoices, ocr_results):
    """Return the sorted union of field names seen in either data source.

    Args:
        gt_invoices: Iterable of ground-truth invoice dicts; field names are
            the keys of each invoice's ``'expected_data'`` mapping.
        ocr_results: Mapping whose values are OCR invoice dicts; field names
            are the keys of each invoice's ``'structured_data'`` mapping.

    Returns:
        A sorted list of the distinct field names. Invoices lacking the
        relevant mapping contribute nothing.
    """
    gt_keys = (
        key
        for inv in gt_invoices
        for key in inv.get('expected_data', {}).keys()
    )
    ocr_keys = (
        key
        for inv in ocr_results.values()
        for key in inv.get('structured_data', {}).keys()
    )
    return sorted(set(gt_keys).union(ocr_keys))

def is_missing(value):
    """Tell whether ``value`` counts as a missing field value.

    Missing means ``None``, a string that is empty or whitespace-only, or an
    empty list. Every other value (including ``0``, ``False`` and an empty
    dict) counts as present.
    """
    if isinstance(value, str):
        return value.strip() == ''
    if isinstance(value, list):
        return len(value) == 0
    return value is None

fields = collect_fields(gt_invoices, ocr_results)
total_ocr = len(ocr_results)

# Column layout of the per-field summary. It is passed to the DataFrame
# constructor so the frame keeps these columns even when `rows` is empty;
# without it, sort_values() below raises KeyError when no fields were found.
coverage_columns = [
    'field',
    'missing_ocr',
    'missing_ocr_pct',
    'bbox_coverage_pct',
    'confidence_coverage_pct',
]

rows = []
for field in fields:
    missing_ocr = 0   # OCR invoices where the field is absent or empty
    bbox_present = 0  # OCR invoices with a 'bounding_boxes' entry for the field
    conf_present = 0  # OCR invoices with a 'confidence_scores' entry for the field
    for inv in ocr_results.values():
        structured = inv.get('structured_data', {})
        if field not in structured or is_missing(structured.get(field)):
            missing_ocr += 1
        if field in inv.get('bounding_boxes', {}):
            bbox_present += 1
        if field in inv.get('confidence_scores', {}):
            conf_present += 1
    # Percentages are relative to the number of OCR invoices; the guard
    # avoids a division by zero when there are none.
    rows.append({
        'field': field,
        'missing_ocr': missing_ocr,
        'missing_ocr_pct': (missing_ocr / total_ocr * 100) if total_ocr else 0.0,
        'bbox_coverage_pct': (bbox_present / total_ocr * 100) if total_ocr else 0.0,
        'confidence_coverage_pct': (conf_present / total_ocr * 100) if total_ocr else 0.0,
    })

# Most frequently missing fields first; ties are ordered by field name.
missing_fields_df = pd.DataFrame(rows, columns=coverage_columns).sort_values(
    ['missing_ocr', 'field'], ascending=[False, True]
)

missing_fields_df


Unnamed: 0,field,missing_ocr,missing_ocr_pct,bbox_coverage_pct,confidence_coverage_pct
0,customer_address,1,33.333333,0.0,0.0
9,vendor_address,1,33.333333,0.0,0.0
1,customer_name,0,0.0,0.0,0.0
2,due_date,0,0.0,33.333333,66.666667
3,invoice_date,0,0.0,0.0,0.0
4,line_items,0,0.0,0.0,0.0
5,po_number,0,0.0,100.0,100.0
6,subtotal,0,0.0,0.0,0.0
7,tax_amount,0,0.0,33.333333,33.333333
8,total_amount,0,0.0,33.333333,66.666667


## Risk list (quick)

- PO numbers: look-alike character confusion (O<->0, I<->1)
- Vendor/customer names truncated
- Addresses abbreviated (Drive->Dr, Street->St)
- Invoice/due dates shifted by 1 day
- Small differences in tax and amount values
