# Showcase Pipeline — R&D Notebook

Razvoj extraction + validation + retry + formatting pipeline-a
test sve prije implementacije backend verzije u FastAPI.


In [20]:
import json
import re
from datetime import datetime

import pytesseract
from PIL import Image


In [21]:
# Simulacija OCR/tekstualni ekstrakt, prije nego što dodam prave servise

mock_text = '''
Invoice No: 12345
Date: 2025-02-10
Customer: John Doe
Items:
 - Laptop, 1, 1000
 - Mouse, 2, 20
Total: 1040
'''

print(mock_text)



Invoice No: 12345
Date: 2025-02-10
Customer: John Doe
Items:
 - Laptop, 1, 1000
 - Mouse, 2, 20
Total: 1040



In [22]:
# LLM će kasnije vratiti ovakav JSON, sada ga samo simuliramo

extracted_json = {
    "invoice_number": "12345",
    "date": "2025-02-10",
    "customer": "John Doe",
    "items": [
        {"name": "Laptop", "qty": 1, "price": 1000},
        {"name": "Mouse", "qty": 2, "price": 20}
    ],
    "total": 1040
}

extracted_json


{'invoice_number': '12345',
 'date': '2025-02-10',
 'customer': 'John Doe',
 'items': [{'name': 'Laptop', 'qty': 1, 'price': 1000},
  {'name': 'Mouse', 'qty': 2, 'price': 20}],
 'total': 1040}

In [23]:
# testiram logiku validacije: nedostajuća polja, datum, suma stavki

def validate_invoice(data: dict):
    errors = []

    required_fields = ["invoice_number", "date", "customer", "items", "total"]
    for f in required_fields:
        if f not in data:
            errors.append(f"Missing field: {f}")
        elif data[f] in [None, ""]:
            errors.append(f"Empty field: {f}")

    try:
        datetime.fromisoformat(data["date"])
    except:
        errors.append("Invalid date format")

    calc = sum(i["qty"] * i["price"] for i in data["items"])
    if calc != data["total"]:
        errors.append(f"Total mismatch: items sum = {calc}, invoice total = {data['total']}")

    return errors


validate_invoice(extracted_json)


[]

In [24]:
bad_invoice = {
    "invoice_number": "",
    "date": "2025/02/10",
    "customer": "John Doe",
    "items": [
        {"name": "Laptop", "qty": 1, "price": 1000},
        {"name": "Mouse", "qty": 2, "price": 20}
    ],
    "total": 999
}

In [25]:
validate_invoice(bad_invoice)


['Empty field: invoice_number',
 'Invalid date format',
 'Total mismatch: items sum = 1040, invoice total = 999']

Iznad vidimo da je trenutni validator uhvatio grešku u datumu (nije iso format), i mismatch u sumi novca, ali ne vidi da je invoice number prazno polje?
'''
  for f in required_fields:
        if f not in data:
            errors.append(f"Missing field: {f}")
'''

In [26]:
bad_invoice1 = {
    "invoice_number": "",
    "date": "2025/02/10",
    "items": [
        {"name": "Laptop", "qty": 1, "price": 1000},
        {"name": "Mouse", "qty": 2, "price": 20}
    ],
    "total": 999
}

In [27]:
validate_invoice(bad_invoice1)

['Empty field: invoice_number',
 'Missing field: customer',
 'Invalid date format',
 'Total mismatch: items sum = 1040, invoice total = 999']

Dakle, missing field se javlja samo kada se data varijabla uopste ne pojavljuje u dictionary, ne i ako je polje prazno!

In [28]:
# ako validator nađe grešku, ovdje se generiše novi prompt za LLM

def generate_retry_prompt(raw_text, errors):
    prompt = f"""
The extracted invoice data has the following validation errors:

{errors}

Please correct ONLY the problematic fields without changing other fields.
Use the information from the raw invoice text below:

{raw_text}
"""
    return prompt


In [29]:
bad_invoice_text = '''
Invoice No: 
Date: 2025/02/10
Customer: John Doe
Items:
 - Laptop, 1, 1000
 - Mouse, 2, 20
Total: 999
'''

errors = validate_invoice(bad_invoice1)
retry_prompt = generate_retry_prompt(bad_invoice_text, errors)
print(retry_prompt)


The extracted invoice data has the following validation errors:

['Empty field: invoice_number', 'Missing field: customer', 'Invalid date format', 'Total mismatch: items sum = 1040, invoice total = 999']

Please correct ONLY the problematic fields without changing other fields.
Use the information from the raw invoice text below:


Invoice No: 
Date: 2025/02/10
Customer: John Doe
Items:
 - Laptop, 1, 1000
 - Mouse, 2, 20
Total: 999




Postavka za stvarni validator

- OCR preko Tesseract - osjetljiv je na niske rezolucije, nečitak tekst, lošu separaciju itd. ali besplatan i jednostavan baseline!

In [30]:
def extract_with_ocr(image_path: str) -> Dict:
    """Simple OCR extraction using pytesseract and regex heuristics.

    Returns a dict matching the invoice schema used across the project.
    """
    text = pytesseract.image_to_string(Image.open(image_path))

    # Heuristics/regexes (improve as needed)
    invoice_number = _search_first(r"Invoice\s*[#:\s]*([A-Za-z0-9-]+)", text)
    date = _search_first(r"Date[:\s]*([0-9/\-\.]+)", text)
    total = _search_first(r"Total\s*[:\-]?\s*\$?\s*([0-9,]+\.?[0-9]{0,2})", text)
    vendor = _search_first(r"^([A-Z][A-Za-z0-9 &,-]{2,})", text, flags=re.M)

    # naive line items extraction: lines containing a price at end
    lines = text.splitlines()
    line_items = []
    for line in lines:
        if re.search(r"\d+\.?\d{0,2}$", line.strip()):
            parts = line.strip().rsplit(None, 1)
            desc = parts[0]
            price = parts[1] if len(parts)>1 else ""
            line_items.append({"description": desc, "quantity": "", "unit_price": "", "total_price": price})

    return {
        "vendor_name": vendor or "",
        "invoice_number": invoice_number or "",
        "invoice_date": date or "",
        "due_date": "",
        "bill_to": "",
        "line_items": line_items,
        "subtotal": "",
        "tax": "",
        "grand_total": total or "",
        "payment_info": "",
        "raw_text": text
    }


def _search_first(pattern, text, flags=0):
    """Return first regex group or None."""
    m = re.search(pattern, text, flags)
    return m.group(1).strip() if m else None


U primjeru iznad (preuzeto sa https://github.com/ShafqaatMalik/llm-based-invoice-ocr), problemi koji mogu dovesti do toga da regex nije pronađen su:

-Ako OCR ne pročita dokument ispravno
-Ako je datum u neodgovarajućem formatu

In [31]:
raw_text = '''ACME Corporation
Invoice  #INV-00921-A
Date: 2024/05/18
Total: $492.20

Item A  200.00
Item B  292.20'''

In [32]:
def extract_invoice_number(text: str):
    text = raw_text

    patterns = [
        r"(?i)invoice\s*[#:\s]*([\w\-]+)",
        r"(?i)inv(?:oice)?\s*([\w\-]+)",
        r"([\w]{3,}-\d{3,})"
    ]

    for pattern in patterns:
        match = re.search(pattern, text)

        if match:
            return match.group(1)
        
    return None

In [33]:
def extract_invoice_number_with_confidence(text: str):
    patterns = [
        (r"(?i)invoice\s*[#:\s]*([\w\-]+)", 0.95),
        (r"(?i)inv(?:oice)?\s*([\w\-]+)", 0.75),
        (r"([\w]{3,}-\d{3,})", 0.45)
    ]
    
    for pattern, confidence in patterns:
        match = re.search(pattern, text)
        if match:
            return {"value": match.group(1), "confidence": confidence}
    
    return {"value": None, "confidence": 0.0}


In [34]:
extract_invoice_number_with_confidence(raw_text)

{'value': 'INV-00921-A', 'confidence': 0.95}

In [35]:
from typing import Dict, List

def extract_invoice_fields(text: str) -> Dict:
    regex_map = {
        "invoice_number": [
            r"(?i)invoice\s*[#:\s]*([\w\-]+)",
            r"(?i)inv(?:oice)?\s*([\w\-]+)",
            r"([\w]{3,}-\d{3,})"
        ],
        "invoice_date": [
            r"Date[:\s]*([0-9]{4}[/\-\.][0-9]{2}[/\-\.][0-9]{2})",
            r"([0-9]{2}[/\-\.][0-9]{2}[/\-\.][0-9]{4})"
        ],
        "vendor_name": [
            r"^([A-Z][A-Za-z0-9 &,-]{2,})" #vendorname mora biti u prvom redu i po;eti velikim slovom
        ],
        "total_amount": [
            r"Total\s*[:\-]?\s*\$?\s*([0-9,]+\.?[0-9]{0,2})"
        ]
    }

    results = {}
    for field, patterns in regex_map.items():
        value = None
        for pattern in patterns:
            match = re.search(pattern, text, flags=re.MULTILINE|re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                break
        results[field] = value

    lines = text.splitlines() #razdvajanje teksta u odvojene stringove u listi
    line_items = []
    for line in lines:                                 #provjera da li linija yavrsava brojem (cijena stavke)
        if re.search(r"\d+\.?\d{0,2}$", line.strip()): #line.strip uklanja whitespace
            parts = line.strip().rsplit(None, 1)       #dijelimo liniju sa cijenom na dvoje, opis stavke i cijena
            desc = parts[0]
            price = parts[1] if len(parts) > 1 else ""
            line_items.append({
                "description": desc,
                "total_price": price
            })

    results["line_items"] = line_items

    return results

In [36]:
extract_invoice_fields(raw_text)

{'invoice_number': 'INV-00921-A',
 'invoice_date': '2024/05/18',
 'vendor_name': 'ACME Corporation',
 'total_amount': '492.20',
 'line_items': [{'description': 'Date:', 'total_price': '2024/05/18'},
  {'description': 'Total:', 'total_price': '$492.20'},
  {'description': 'Item A', 'total_price': '200.00'},
  {'description': 'Item B', 'total_price': '292.20'}]}

In [41]:
def validate_invoice_fields(results: Dict, text: str) -> Dict:
    """
    Takes extracted invoice fields and validates them.
    Adds confidence, required flag and basic validity checks.
    """
    schema = {
        "invoice_number": {"required": True, "type": str},
        "invoice_date": {"required": True, "type": "date"},
        "vendor_name": {"required": True, "type": str},
        "total_amount": {"required": True, "type": float},
        "line_items": {"required": True, "type": list}
    }
    
    validated = {}
    
    for field, info in schema.items():
        value = results.get(field)
        valid = True
        confidence = 1.0 if value else 0.0
        
        # baratanje sa brojem fakture (confidence scores)
        if field == "invoice_number":
            inv_result = extract_invoice_number_with_confidence(text)
            value = inv_result["value"]
            confidence = inv_result["confidence"]
            valid = bool(value)  # valid if value found
        
        # provjera tipa podataka, za ostala polja:
        elif value:
            if info["type"] == float:
                try:
                    float(value)
                except:
                    valid = False
            elif info["type"] == "date":
                try:
                    datetime.strptime(value, "%Y/%m/%d")
                except:
                    valid = False
            elif info["type"] == list:
                if not isinstance(value, list):
                    valid = False
                else:
                    for item in value:
                        if "description" not in item or "total_price" not in item:
                            valid = False
                            break
            # za string ne radim dodatnu validaciju
    
        if info["required"] and (value is None or value == ""):
            valid = False
            confidence = 0.0
        
        validated[field] = {
            "value": value,
            "confidence": confidence,
            "required": info["required"],
            "valid": valid
        }
    
    return validated

In [43]:
validate_invoice_fields(extract_invoice_fields(sample_text), sample_text)

{'invoice_number': {'value': 'INV-00921-A',
  'confidence': 0.95,
  'required': True,
  'valid': True},
 'invoice_date': {'value': '2024/05/18',
  'confidence': 1.0,
  'required': True,
  'valid': True},
 'vendor_name': {'value': 'ACME Corporation',
  'confidence': 1.0,
  'required': True,
  'valid': True},
 'total_amount': {'value': '492.20',
  'confidence': 1.0,
  'required': True,
  'valid': True},
 'line_items': {'value': [{'description': 'Date:',
    'total_price': '2024/05/18'},
   {'description': 'Total:', 'total_price': '$492.20'},
   {'description': 'Item A', 'total_price': '200.00'},
   {'description': 'Item B', 'total_price': '292.20'},
   {'description': 'Total', 'total_price': '350.00'}],
  'confidence': 1.0,
  'required': True,
  'valid': True}}

In [44]:
bad_sample_text = """acme corp
Inv #XYZ123
Dt: 18-05-2024
Tot: USD Four Ninety-Two

Item A two hundred
Item B 292,20
"""

validate_invoice_fields(extract_invoice_fields(bad_sample_text), bad_sample_text)

{'invoice_number': {'value': None,
  'confidence': 0.0,
  'required': True,
  'valid': False},
 'invoice_date': {'value': '18-05-2024',
  'confidence': 1.0,
  'required': True,
  'valid': False},
 'vendor_name': {'value': 'acme corp',
  'confidence': 1.0,
  'required': True,
  'valid': True},
 'total_amount': {'value': None,
  'confidence': 0.0,
  'required': True,
  'valid': False},
 'line_items': {'value': [{'description': 'Inv', 'total_price': '#XYZ123'},
   {'description': 'Dt:', 'total_price': '18-05-2024'},
   {'description': 'Item B', 'total_price': '292,20'}],
  'confidence': 1.0,
  'required': True,
  'valid': True}}