# PDF Processing via LLMaaS

## 0. Setting your Name and Email

Please start by putting your name and email in the following variables. Stick to the required format, i.e. NAME_SURNAME.

In [1]:
# Write your NAME_SURNAME here, as well as the email address you used to log in to Celonis.
# NOTE(review): the name below contains no underscore, so it does not look like the
# NAME_SURNAME format requested above — confirm before submitting.
# Presumably both values are consumed by push.ipynb at the end of the notebook — verify.
MY_NAME = 'SCHUMANN'
MY_EMAIL = 'schumann.marvin@outlook.com'

## 1. Installing and importing required packages

In [None]:
# Run the first time you execute the script and then comment it out again.
# Installs pycelonis (resolved via the extra Celonis package index) and nbformat.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which is not
# necessarily the environment of the running kernel; `%pip install ...` targets the kernel.
!pip install --extra-index-url=https://pypi.celonis.cloud/ pycelonis
!pip install nbformat

Looking in indexes: https://pypi.org/simple, https://pypi.celonis.cloud/


In [14]:
# Step 1 – environment setup
import sys
import subprocess

def ensure_package(pkg, import_name=None):
    """Install ``pkg`` with pip unless its module can already be imported.

    The name given to ``pip install`` is not always the name used in an
    ``import`` statement (``pillow`` is imported as ``PIL``). Checking the pip
    name directly would make the import fail on every run and trigger a
    needless reinstall, so the importable name is resolved separately.

    Args:
        pkg: Distribution name passed to ``pip install``.
        import_name: Module name to probe. When omitted, a small alias table is
            consulted and, failing that, ``pkg`` itself is used.

    Raises:
        subprocess.CalledProcessError: If the pip subprocess exits non-zero.
    """
    # Distributions whose importable module name differs from the pip name.
    known_aliases = {"pillow": "PIL"}
    module_name = import_name or known_aliases.get(pkg.lower(), pkg)
    try:
        __import__(module_name)
    except ImportError:
        # Use the interpreter running this kernel so the package lands in the
        # same environment that will import it.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

# Core packages required downstream
# Each name is handed to ensure_package, which pip-installs it when its import check fails.
# NOTE(review): "pillow" is a distribution name; the module imported further down
# in this cell is `PIL`.
for pkg_name in [
    "torch",
    "transformers",
    "accelerate",
    "pillow",
    "pdf2image",
    "pandas",
    "tqdm",
    "numpy",
    "einops",
    "timm"
]:
    ensure_package(pkg_name)

import os
import re
import json
from pathlib import Path

import torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from pdf2image import convert_from_path
from transformers import AutoModelForCausalLM, AutoProcessor

# Display settings: do not truncate the column list and use a 120-character display width.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

Collecting einops
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
Installing collected packages: einops
Successfully installed einops-0.8.1
Collecting timm
  Downloading timm-1.0.22-py3-none-any.whl.metadata (63 kB)
Downloading timm-1.0.22-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m51.5 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: timm
Successfully installed timm-1.0.22


## 2. Extract information from Invoices

This is the section you will need to fill in. Your code should create the following
- **a pandas dataframe called df that includes the extracted information**.
- **the dataframe should contain a column called 'po_reference' that contains the reference to the PO**
- **the values in the column 'po_reference' should be 11-character strings. Use left padding with zeros where needed.**

In [None]:
# Step 2 – load invoice pages as images

# Folder containing the invoice files. Set the INVOICE_DIR environment variable to
# point somewhere else; the fallback is the path originally hard-coded here.
INVOICE_DIR = Path(os.getenv("INVOICE_DIR", "/Users/marvinschumann/orbit_challenge/Invoices"))
# Optional location of the poppler binaries, passed to pdf2image as `poppler_path`.
POPPLER_PATH = os.getenv("POPPLER_PATH")  # set if poppler isn't on PATH

def load_invoice_pages(invoice_dir: Path) -> list[dict]:
    """Load every invoice file in ``invoice_dir`` as a list of RGB page images.

    PDF files are rasterised page by page with ``convert_from_path`` at 200 dpi.
    PNG/JPEG files are loaded as a single page. Files with any other suffix are
    reported with a printed message and skipped. Only direct children of the
    directory are scanned, in sorted path order.

    Args:
        invoice_dir: Directory holding the invoice files.

    Returns:
        One dict per page with the keys ``invoice_id`` (the file stem),
        ``page_index`` (1-based), ``image`` (an RGB PIL image) and
        ``source_path`` (the file path as a string).
    """
    pages = []
    files = sorted(p for p in invoice_dir.iterdir() if p.is_file())
    for file_path in tqdm(files, desc="Loading invoices"):
        suffix = file_path.suffix.lower()
        invoice_id = file_path.stem.strip()
        if suffix == ".pdf":
            rendered = convert_from_path(
                str(file_path),
                dpi=200,
                poppler_path=POPPLER_PATH,
                fmt="png"
            )
            images = [page_image.convert("RGB") for page_image in rendered]
        elif suffix in {".png", ".jpg", ".jpeg"}:
            # Image.open keeps the underlying file open; convert() loads the pixel
            # data and returns an independent image, so the handle can be released
            # as soon as the conversion is done.
            with Image.open(file_path) as opened:
                images = [opened.convert("RGB")]
        else:
            print(f"Skipping unsupported file: {file_path.name}")
            continue

        # Both branches produce the same record shape; build it in one place.
        for idx, img in enumerate(images, start=1):
            pages.append(
                {
                    "invoice_id": invoice_id,
                    "page_index": idx,
                    "image": img,
                    "source_path": str(file_path),
                }
            )
    return pages

# Load all pages once; the summary counts distinct invoice_id values (file stems).
invoice_pages = load_invoice_pages(INVOICE_DIR)
print(f"Loaded {len(invoice_pages)} page(s) from {len({p['invoice_id'] for p in invoice_pages})} invoice file(s).")

Loading invoices: 100%|██████████| 5/5 [00:03<00:00,  1.31it/s]

Loaded 5 page(s) from 5 invoice file(s).





In [None]:
# Steps 3 & 4 – initialize InternVL and extract invoice fields (enhanced)
from transformers import AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM
from tqdm import tqdm
import torch
import json
import re

# Checkpoint to load and the device used for inference (GPU when available).
MODEL_ID = "OpenGVLab/InternVL2-1B"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model and processors
# NOTE(review): trust_remote_code=True executes Python code shipped with the model
# repository; consider pinning a `revision=` so that code cannot change underneath you.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Half precision on GPU, full precision on CPU.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    dtype=torch.float16 if DEVICE.type == "cuda" else torch.float32,
    device_map="auto" if DEVICE.type == "cuda" else None,
)
# When device_map="auto" is used the weights have already been placed on their
# devices during loading, and moving a dispatched model again can warn or fail.
# Only move the model explicitly when no device map was requested.
if DEVICE.type != "cuda":
    model.to(DEVICE)
model.eval()

# Generation settings passed to model.chat as `generation_config` for every request.
# NOTE(review): no `do_sample` entry is set here; if chat() forwards these to Hugging
# Face `generate`, temperature/top_p presumably only matter when sampling is enabled — confirm.
gen_config_dict = dict(
    max_new_tokens=512,
    temperature=0.1,
    top_p=0.9,
)

# Fields every invoice record must carry; this order is also the column order of the
# final dataframe. "invoice_id" is filled from the file name, not from the model
# (merge_fields never overwrites it).
REQUIRED_FIELDS = [
    "vendor_name",
    "vendor_address",
    "payment_terms",
    "invoice_value",
    "company_code",
    "po_reference",
    "invoice_id",
]

# Human-readable description of each field, inserted into the single-field follow-up prompt.
FIELD_DESCRIPTIONS = {
    "vendor_name": "the legal name of the vendor or supplier",
    "vendor_address": "the full postal address of the vendor",
    "payment_terms": "the payment terms (e.g., Net 30, Due on receipt)",
    "invoice_value": "the total invoice amount including currency symbol",
    "company_code": "the company code associated with the purchase order",
    "po_reference": "the purchase order reference or number",
    "invoice_id": "the invoice number or identifier",
}

# Extra formatting instructions appended to follow-up prompts for the matching field.
FIELD_PROMPT_HINTS = {
    "vendor_name": "Return only the vendor's name (no invoice numbers).",
    "vendor_address": "Return the full mailing address as shown on the invoice.",
    "payment_terms": "Return the payment terms text exactly as printed.",
    "invoice_value": "Return the total invoice amount including the currency symbol if present.",
    "company_code": "Return the company code as printed (numbers only, no labels).",
    "po_reference": "Return only the digits of the purchase order reference (no prefixes or text).",
    "invoice_id": "Return the invoice number exactly as printed on the document.",
}

def _contains_digit(value):
    """Return True when ``value`` holds at least one decimal digit."""
    return bool(re.search(r"\d", value))


# Fields whose extracted value only counts as valid when it contains a digit.
FIELD_VALIDATORS = {
    field_name: _contains_digit
    for field_name in ("po_reference", "company_code", "invoice_value", "invoice_id")
}

# Placeholder answers treated as "no value". is_valid_field compares against the
# stripped, lower-cased value, so entries here must be lower case.
INVALID_FIELD_STRINGS = {
    "",
    "unknown",
    "undefined",
    "not provided",
    "not applicable",
    "n/a",
    "na",
    "none",
    "null",
}

# First prompt sent for every page: asks for all fields at once as a JSON object.
# The key list is spelled out in the text rather than derived from REQUIRED_FIELDS,
# so the two must be kept in sync by hand.
PRIMARY_PROMPT = (
    "You are an expert invoice analyst. Carefully read the invoice image and return a JSON object "
    "with the following keys: vendor_name, vendor_address, payment_terms, invoice_value, company_code, "
    "po_reference, invoice_id. Use double quotes for all keys and string values. If a value is missing, "
    "respond with the literal string UNKNOWN for that field. Do not add commentary or extra keys."
)

# Follow-up prompt for one missing field; formatted with `field` and `description`.
SINGLE_FIELD_PROMPT_TEMPLATE = (
    "You previously inspected this invoice. Provide the value for the field '{field}' ({description}). "
    "Return a JSON object containing only the key '{field}' with its value. Use double quotes and "
    "respond with the literal string UNKNOWN if the value cannot be determined."
)

# Last-resort prompt covering all still-missing fields; formatted with a comma-separated `fields` list.
MISSING_FIELDS_PROMPT_TEMPLATE = (
    "The following fields could not be confirmed: {fields}. Review the invoice image again and respond "
    "with a JSON object that contains these keys only. Use double quotes and set any unknown value to "
    "the literal string UNKNOWN."
)

# Upper bounds on retries: full-invoice prompt attempts per page, and attempts per missing field.
MAX_PRIMARY_ATTEMPTS = 3
MAX_FIELD_ATTEMPTS = 2

def clean_response_text(text: str) -> str:
    """Strip whitespace and Markdown code fences from a reply and straighten curly double quotes.

    A leading fence (three backticks, optionally followed by ``json`` in any case)
    is removed only when the reply starts with one; trailing backticks are then
    trimmed as well.
    """
    result = text.strip()
    if result.startswith("```"):
        # Drop the opening fence, then peel any closing backticks off the end.
        body = re.sub(r"^```(?:json)?", "", result, flags=re.IGNORECASE)
        result = body.strip().rstrip("`").strip()
    for curly_quote in ("\u201c", "\u201d"):
        result = result.replace(curly_quote, '"')
    return result

def parse_json_response(raw_text: str, expected_fields=None) -> dict:
    """Turn a model reply into a dict of extracted fields.

    Parsing is attempted in this order:

    1. ``json.loads`` on the cleaned reply.
    2. ``json.loads`` on the span from the first ``{`` to the last ``}``, so a
       reply with text around the JSON object still parses as JSON (this keeps
       non-string values such as numbers, which the regex fallback cannot see).
    3. A per-field regex that collects ``"field": "value"`` pairs.

    A one-element list wrapping a dict is unwrapped in steps 1 and 2.

    Args:
        raw_text: Raw text returned by the model.
        expected_fields: Field names to look for in the regex fallback.
            Defaults to ``REQUIRED_FIELDS`` when omitted or empty.

    Returns:
        The parsed dict when JSON parsing succeeds (it may contain extra keys and
        non-string values), otherwise a dict of regex matches, which may be empty.
    """
    cleaned = clean_response_text(raw_text)
    expected_fields = expected_fields or REQUIRED_FIELDS

    candidates = [cleaned]
    first_brace = cleaned.find("{")
    last_brace = cleaned.rfind("}")
    if 0 <= first_brace < last_brace:
        embedded = cleaned[first_brace:last_brace + 1]
        if embedded != cleaned:
            candidates.append(embedded)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            # Malformed (or absurdly nested) JSON: try the next candidate, then
            # fall back to the regex scan below.
            continue
        if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], dict):
            parsed = parsed[0]
        if isinstance(parsed, dict):
            return parsed

    extracted = {}
    for field in expected_fields:
        # Escape the name so a field containing regex metacharacters is matched literally.
        pattern = rf'"{re.escape(field)}"\s*:\s*"([^"]*)"'
        match = re.search(pattern, cleaned)
        if match:
            extracted[field] = match.group(1)
    return extracted

def is_valid_field(value: str, field: str | None = None) -> bool:
    if value is None:
        return False
    normalized = str(value).strip()
    if not normalized or normalized.lower() in INVALID_FIELD_STRINGS:
        return False
    if field:
        normalized_plain = normalized.lower().replace(" ", "")
        field_plain = field.replace("_", "").lower()
        if normalized_plain == field_plain:
            return False
        validator = FIELD_VALIDATORS.get(field)
        if validator and not validator(normalized):
            return False
    return True

def merge_fields(target: dict, updates: dict) -> None:
    """Copy newly found values from ``updates`` into ``target`` in place.

    Only keys that already exist in ``target`` are considered, ``invoice_id`` is
    never touched, and a valid value already stored in ``target`` is never replaced.
    Accepted values are stored as stripped strings.
    """
    for name, incoming in updates.items():
        if name == "invoice_id" or name not in target:
            continue
        if not is_valid_field(incoming, name):
            continue
        if is_valid_field(target.get(name, ""), name):
            # First valid value wins.
            continue
        target[name] = str(incoming).strip()

def get_missing_fields(record: dict) -> list:
    """List the required fields, except ``invoice_id``, for which ``record`` has no valid value.

    The result follows the order of ``REQUIRED_FIELDS``.
    """
    unresolved = []
    for name in REQUIRED_FIELDS:
        if name == "invoice_id":
            continue
        if is_valid_field(record.get(name, ""), name):
            continue
        unresolved.append(name)
    return unresolved

def run_chat_once(pixel_values, prompt, history=None):
    """Send one prompt about the given image to the model and return its reply.

    Args:
        pixel_values: Pre-processed image tensor passed to ``model.chat``.
        prompt: Question text.
        history: Conversation history from an earlier call, or ``None`` to
            start a fresh conversation.

    Returns:
        ``(answer, history)`` where ``answer`` is the stripped reply text and
        ``history`` is the updated conversation when the model returns one,
        otherwise the ``history`` that was passed in.
    """
    response = model.chat(
        tokenizer=tokenizer,
        pixel_values=pixel_values,
        question=prompt,
        history=history,
        # Defensive copy so the shared module-level settings cannot be modified
        # by the chat implementation.
        generation_config=dict(gen_config_dict),
        # Without this flag the call returns only the reply text, so the tuple
        # branch below never ran and follow-up prompts never saw earlier turns.
        # NOTE(review): `return_history` is taken from the InternVL2 model card;
        # confirm it against the model revision actually loaded.
        return_history=True,
    )
    if isinstance(response, tuple) and len(response) == 2:
        answer, new_history = response
    else:
        answer = response
        new_history = history
    return answer.strip(), new_history

def extract_page_metadata(page_entry):
    """Extract the required invoice fields from one page image.

    The page is queried in up to three stages, stopping early once nothing is
    missing:

    1. The full-invoice prompt, up to ``MAX_PRIMARY_ATTEMPTS`` times.
    2. One targeted prompt per missing field, up to ``MAX_FIELD_ATTEMPTS`` times each.
    3. A single prompt listing every field that is still missing.

    Values are merged with ``merge_fields``, so the first valid value found for a
    field is kept. ``invoice_id`` always comes from ``page_entry``.

    Args:
        page_entry: Dict with at least the keys ``image`` and ``invoice_id``.

    Returns:
        A copy of ``page_entry`` without ``image``, extended with one key per
        required field plus ``missing_fields`` (list of unresolved field names)
        and ``raw_attempts`` (list of dicts recording every prompt and reply).
    """
    image = page_entry["image"]
    invoice_id = page_entry["invoice_id"]
    # Match the model's device AND dtype: the model is loaded in float16 on CUDA,
    # while the image processor produces float32 tensors.
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(
        device=DEVICE, dtype=model.dtype
    )

    aggregated = {field: "" for field in REQUIRED_FIELDS}
    aggregated["invoice_id"] = invoice_id
    raw_attempts = []
    history = None

    # Primary attempts
    # NOTE(review): every attempt sends the same prompt with no history, so retries
    # can only differ if generation is non-deterministic — confirm sampling is active.
    for attempt in range(1, MAX_PRIMARY_ATTEMPTS + 1):
        response_text, history = run_chat_once(pixel_values, PRIMARY_PROMPT, history=None)
        raw_attempts.append(
            {
                "attempt_type": "primary",
                "attempt": attempt,
                "prompt": PRIMARY_PROMPT,
                "response": response_text,
            }
        )
        parsed = parse_json_response(response_text)
        merge_fields(aggregated, parsed)
        if not get_missing_fields(aggregated):
            break

    missing = get_missing_fields(aggregated)

    # Targeted follow-ups per missing field
    for field in missing.copy():
        description = FIELD_DESCRIPTIONS.get(field, field)
        prompt = SINGLE_FIELD_PROMPT_TEMPLATE.format(field=field, description=description)
        hint = FIELD_PROMPT_HINTS.get(field)
        if hint:
            prompt = f"{prompt} {hint}"
        field_history = history  # reuse most recent history if available
        for attempt in range(1, MAX_FIELD_ATTEMPTS + 1):
            response_text, field_history = run_chat_once(pixel_values, prompt, history=field_history)
            raw_attempts.append(
                {
                    "attempt_type": "single_field",
                    "field": field,
                    "attempt": attempt,
                    "prompt": prompt,
                    "response": response_text,
                }
            )
            parsed = parse_json_response(response_text, expected_fields=[field])
            merge_fields(aggregated, parsed)
            if field not in get_missing_fields(aggregated):
                break

    missing = get_missing_fields(aggregated)

    # Final multi-field prompt if any are still missing
    if missing:
        prompt = MISSING_FIELDS_PROMPT_TEMPLATE.format(fields=", ".join(missing))
        hint_text = " ".join(filter(None, (FIELD_PROMPT_HINTS.get(field) for field in missing)))
        if hint_text:
            prompt = f"{prompt} {hint_text}"
        response_text, _ = run_chat_once(pixel_values, prompt, history=history)
        raw_attempts.append(
            {
                "attempt_type": "missing_fields",
                "fields": list(missing),
                "prompt": prompt,
                "response": response_text,
            }
        )
        parsed = parse_json_response(response_text, expected_fields=missing)
        merge_fields(aggregated, parsed)

    final_missing = get_missing_fields(aggregated)

    result_record = {field: aggregated.get(field, "") for field in REQUIRED_FIELDS}
    result_record["missing_fields"] = final_missing
    result_record["raw_attempts"] = raw_attempts

    enriched_entry = {
        **page_entry,
        **result_record,
    }
    # Drop the page image from the result so extraction results stay lightweight;
    # the original page_entry is left untouched because enriched_entry is a new dict.
    enriched_entry.pop("image", None)
    return enriched_entry

# Run the extraction for every loaded page (one model conversation per page),
# showing a progress bar while it runs.
page_results = [
    extract_page_metadata(page)
    for page in tqdm(invoice_pages, desc="Extracting data")
]

print(f"Extraction complete for {len(page_results)} pages.")

Extracting data:   0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Extracting data:  20%|██        | 1/5 [00:11<00:47, 11.80s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Extracting data:  40%|████      | 2/5 [00:16<00:23,  7.77s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Extracting data:  60%|██████    | 3/5 [00:19<00:11,  5.63s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Extracting data:  80%|████████  | 4/5 [00:25<00:05,  5.49s/it]Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Extracting data: 100%|██████████| 5/5 [00:31<00:00,  6.24s/it]

Extraction complete for 5 pages.





In [None]:
# Step 5 – consolidate page results into final dataframe

def consolidate_invoice_pages(pages):
    """Collapse per-page extraction results into one record per invoice.

    Pages are grouped by ``invoice_id``. For every required field the first valid
    value seen across an invoice's pages is kept. Raw attempts are concatenated and
    the per-page ``missing_fields`` are unioned into a set.

    Args:
        pages: Iterable of page result dicts, each with an ``invoice_id`` key.

    Returns:
        A tuple ``(records, consolidated, quality_issues)``:
        ``records`` is a list of dicts restricted to ``REQUIRED_FIELDS``;
        ``consolidated`` maps invoice id to the merged entry including
        ``raw_attempts`` and ``missing_fields``; ``quality_issues`` lists
        ``{"invoice_id", "missing_fields"}`` dicts for invoices that still have
        unresolved fields after merging.
    """
    consolidated = {}
    for page_result in pages:
        key = page_result["invoice_id"]
        bucket = consolidated.get(key)
        if bucket is None:
            # First page of this invoice: start from an all-empty record.
            bucket = dict.fromkeys(REQUIRED_FIELDS, "")
            bucket["invoice_id"] = key
            bucket["raw_attempts"] = []
            bucket["missing_fields"] = set()
            consolidated[key] = bucket

        for name in REQUIRED_FIELDS:
            if name == "invoice_id":
                continue
            proposed = str(page_result.get(name, "")).strip()
            if not is_valid_field(proposed, name):
                continue
            if is_valid_field(bucket.get(name, ""), name):
                continue
            bucket[name] = proposed

        bucket["raw_attempts"].extend(page_result.get("raw_attempts", []))
        bucket["missing_fields"].update(page_result.get("missing_fields", []))

    records = []
    quality_issues = []
    for key, bucket in consolidated.items():
        records.append({name: bucket.get(name, "") for name in REQUIRED_FIELDS})
        unresolved = []
        for name in REQUIRED_FIELDS:
            if name != "invoice_id" and not is_valid_field(bucket.get(name, ""), name):
                unresolved.append(name)
        if unresolved:
            quality_issues.append({"invoice_id": key, "missing_fields": unresolved})

    return records, consolidated, quality_issues

# Merge page-level results into one record per invoice. Also keeps the merged
# diagnostics (keyed by invoice id) and the list of invoices with unresolved fields.
invoice_records, extraction_diagnostics, extraction_issues = consolidate_invoice_pages(page_results)

# One row per invoice, columns in REQUIRED_FIELDS order.
df = pd.DataFrame(invoice_records, columns=REQUIRED_FIELDS)

display(df)

# Surface invoices for which at least one field could not be extracted.
if extraction_issues:
    print("Warning: Some invoices still have unresolved fields:")
    display(pd.DataFrame(extraction_issues))

Unnamed: 0,vendor_name,vendor_address,payment_terms,invoice_value,company_code,po_reference,invoice_id
0,5578 Vendor Street,Business District,Due within 30 days,"€3,360.00",5578,000008568534,INV-2020-08-001247
1,Vendor34,789 Vendor Street,Late payments may incur interest charges,"£1,400.00",CompanyCode4,00000048334,INV-2020-001
2,,,,,,00000000000,INV-2020-07-001853
3,Vendor45,789 Vendor Street,Late payments may incur interest charges at 1....,"€5,057.50",CompanyCode3,00000000181,INV-2021-001
4,Vendor34,"789 Vendor Street, Vendor City, State 54321","Due Dates: March 5, 2021 Due Dates: March 20, ...","€1,500.00",123456789,PO_123456789,INV-2021-002


vendor_name       0
vendor_address    0
payment_terms     0
invoice_value     0
company_code      0
po_reference      0
invoice_id        0
dtype: int64


In [None]:
# Step 6 – normalize values and validate completeness

import re


def sanitize_po(po_value: str) -> str:
    """Normalise a PO reference to a zero-padded, digits-only string of 11 characters.

    All non-digit characters are removed. Existing leading zeros are dropped
    before padding, so a value that was already padded too far (for example a
    12-character ``000008568534``) comes back with exactly 11 characters. A
    reference with more than 11 significant digits is returned unpadded and
    untruncated, because cutting digits would change the reference.

    Args:
        po_value: Raw PO reference text. ``None``, an empty value or a float
            NaN (a missing pandas cell) are treated as "no reference".

    Returns:
        The normalised reference, or ``""`` when the input holds no digits.
    """
    # NaN is the only float that differs from itself; it must not reach re.sub.
    if not po_value or (isinstance(po_value, float) and po_value != po_value):
        return ""
    digits = re.sub(r"\D", "", po_value)
    if not digits:
        return ""
    # An all-zero reference keeps a single zero so it still pads to eleven zeros.
    significant = digits.lstrip("0") or "0"
    return significant.zfill(11)

# Rebind df to a copy so the frame object displayed by the previous cell is not
# modified in place, then normalise the PO references.
df = df.copy()
df["po_reference"] = df["po_reference"].apply(sanitize_po)

validation_issues = []
for _, row in df.iterrows():
    # Pass the field name so the field-specific checks in is_valid_field (digit
    # validators, "value merely repeats the key") run here too, exactly as they
    # do when the pages are consolidated.
    missing = [
        field for field in REQUIRED_FIELDS
        if field != "invoice_id" and not is_valid_field(row.get(field, ""), field)
    ]
    if missing:
        validation_issues.append({
            "invoice_id": row["invoice_id"],
            "missing_fields": missing,
        })

if validation_issues:
    print("Extraction validation failed. Review diagnostics below.")
    display(pd.DataFrame(validation_issues))
    # The diagnostics only exist if the consolidation cell has been run in this kernel.
    if 'extraction_diagnostics' in globals():
        debug_rows = []
        for issue in validation_issues:
            diag = extraction_diagnostics.get(issue["invoice_id"], {})
            debug_rows.append({
                "invoice_id": issue["invoice_id"],
                "missing_fields": issue["missing_fields"],
                "raw_attempts": diag.get("raw_attempts", []),
            })
        print("Detailed raw attempts for problematic invoices:")
        display(pd.DataFrame(debug_rows))
    # Stop the notebook here so incomplete data is not pushed by the next section.
    raise ValueError("Not all required invoice fields could be extracted automatically.")

print("All invoice records passed validation.")
display(df)

## 3. Pushing Data back to Data Pool

In [None]:
%run push.ipynb