# Credit OCR System Process

This notebook demonstrates a complete OCR pipeline for processing credit request documents.
It uses Azure Form Recognizer for OCR, Ollama for LLM-based field extraction, and 
provides visualization capabilities for extracted data.

Prerequisites:
- Docker installed and running
- Python 3.8+ with required packages (see requirements.txt/pyproject.toml)
- Azure Form Recognizer service configured
- Ollama running locally (automatically started via testcontainers)
- Sample credit request PDF document in tests/tmp/

The pipeline performs:
1. Document OCR using Azure Form Recognizer
2. Text extraction with bounding boxes and confidence scores
3. LLM-based field extraction using Ollama
4. Data visualization and validation
5. Database storage (optional)

Key components:
- Azure OCR client for document analysis
- Ollama client for LLM processing
- Field extractor for structured data extraction
- PDF visualizer for result validation
- Configuration management for various services

In [1]:
# Standard library imports
import json
import logging
import os
import sys
import time
import uuid
from datetime import datetime
from pathlib import Path
import requests

# Third-party imports
import pandas as pd

# Set up Python path and working directory FIRST
project_root = Path.cwd().parent  # Go up one level from notebooks/
sys.path.insert(0, str(project_root))
os.chdir(project_root)  # Change to project root for config loading

# Import backend modules (now that path is set up)
from src.config import AppConfig, DocumentProcessingConfig
from src.ocr.azure_ocr_client import analyze_single_document_with_azure
from src.ocr.postprocess import extract_text_lines_with_bbox_and_confidence
from src.visualization.pdf_visualizer import visualize_extracted_fields

In [2]:
# Import required modules with proper error handling
try:
    from testcontainers.core.container import DockerContainer
except ImportError as import_error:
    print(f"Import error: {import_error}")
    print("Please ensure you're running this from the project root directory")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python path: {sys.path}")
    raise

# Load configuration with simple string path (now from project root)
app_config: AppConfig = AppConfig("config")
print(f"Model name: {app_config.generative_llm.model_name}")
print(f"URL: {app_config.generative_llm.url}")

# Use current working directory instead of __file__ for notebook compatibility
current_working_directory: str = str(Path.cwd())
cache_directory: str = "ollama_cache_generative"

# Start the generative model with ollama using Docker container (fixed port)
print("Starting Ollama test container...")
ollama_container = DockerContainer("ollama/ollama:0.5.13")
ollama_container.with_exposed_ports(11435)
ollama_container.with_bind_ports(11434, 11435)  # Fixed port mapping
ollama_container.with_volume_mapping(f"{current_working_directory}/data/{cache_directory}", "/root/.ollama", "rw")
ollama_container.start()
ollama_port: int = 11435  # Use fixed port instead of dynamic
os.environ["OLLAMA_HOST"] = "localhost"
os.environ["OLLAMA_PORT"] = str(ollama_port)

print(f"Ollama started on port {ollama_port}")
print("Test environment started (Ollama, etc.)")

# Load the required model
print("Loading Ollama model...")
model_name: str = app_config.generative_llm.model_name
ollama_url: str = f"http://localhost:{ollama_port}"

# Wait for Ollama to be ready
max_retries: int = 30
for attempt in range(max_retries):
    try:
        response = requests.get(f"{ollama_url}/api/tags", timeout=5)
        if response.status_code == 200:
            print("Ollama is ready")
            break
    except Exception as e:
        if attempt < max_retries - 1:
            print(f"Waiting for Ollama to be ready... (attempt {attempt + 1}/{max_retries})")
            time.sleep(2)
        else:
            print(f"Ollama failed to become ready: {e}")
            raise

# Check if model is already loaded
try:
    response = requests.get(f"{ollama_url}/api/tags")
    if response.status_code == 200:
        models = response.json().get("models", [])
        model_names = [m.get("name") for m in models]
        if model_name in model_names:
            print(f"Model {model_name} is already loaded")
        else:
            print(f"Model {model_name} not found, pulling...")
            # Pull the model
            pull_response = requests.post(f"{ollama_url}/api/pull", json={"name": model_name})
            if pull_response.status_code == 200:
                print(f"Model {model_name} pulled successfully")
            else:
                print(f"Failed to pull model: {pull_response.text}")
except Exception as e:
    print(f"Error checking/loading model: {e}")

print("Ollama setup complete!")

INFO:src.config:Using Ollama URL from config file: http://127.0.0.1:11435
Pulling image ollama/ollama:0.5.13
INFO:testcontainers.core.container:Pulling image ollama/ollama:0.5.13
Container started: a3e4e0ef0838
INFO:testcontainers.core.container:Container started: a3e4e0ef0838


Model name: llama3.1:8b
URL: http://127.0.0.1:11435
Starting Ollama test container...
Ollama started on port 11435
Test environment started (Ollama, etc.)
Loading Ollama model...
Waiting for Ollama to be ready... (attempt 1/30)
Ollama is ready
Model llama3.1:8b is already loaded
Ollama setup complete!


In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

---

## 1. Manually define one CreditRequest

In [4]:
# Manually define one CreditRequest
credit_request_id = str(uuid.uuid4())
customer_name = "Hotel zur Taube" # TODO: Change customer name
credit_request_metadata = {
    "id": credit_request_id,
    "customer_name": customer_name,
    "request_type": "Immobilienfinanzierung",
    "purpose": "Kauf einer Gewerbeimmobilie",
    "created_at": datetime.now().isoformat(),
    "status": "In Bearbeitung"
}

# Define document information
document_filename = "Kreditantrag_HotelZurTaube.pdf" # TODO: Change document name
document_path = Path("tests/tmp") / document_filename  # Now relative to project root
document_type = "Kreditantrag"
document_id = str(uuid.uuid4())

print(f"CreditRequest defined:")
print(f"   ID: {credit_request_id}")
print(f"   Customer: {customer_name}")
print(f"   Type: {credit_request_metadata['request_type']}")
print(f"   Purpose: {credit_request_metadata['purpose']}")
print(f"   Document ID: {document_id}")
print(f"   Document Path: {document_path}")
print(f"   Document Type: {document_type}")

# Verify document exists
if not document_path.exists():
    raise FileNotFoundError(f"Document not found: {document_path}")
print(f"Document found: {document_path}")

CreditRequest defined:
   ID: e66bc98e-2333-4217-bff7-b21b94594d91
   Customer: Hotel zur Taube
   Type: Immobilienfinanzierung
   Purpose: Kauf einer Gewerbeimmobilie
   Document ID: 81e1a236-7ca1-452b-8ef6-4ade61a71bba
   Document Path: tests/tmp/Kreditantrag_HotelZurTaube.pdf
   Document Type: Kreditantrag
Document found: tests/tmp/Kreditantrag_HotelZurTaube.pdf


---

## 2. Run OCR with Azure

In [5]:
print("Running Azure OCR...")
azure_ocr_result = analyze_single_document_with_azure(str(document_path))
ocr_lines = extract_text_lines_with_bbox_and_confidence(azure_ocr_result)
print(f"OCR completed: {len(ocr_lines)} lines extracted")

INFO:src.ocr.azure_ocr_client:Sending document to Azure OCR: tests/tmp/Kreditantrag_HotelZurTaube.pdf
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://free-f0-instance.cognitiveservices.azure.com/formrecognizer/documentModels/prebuilt-document:analyze?stringIndexType=unicodeCodePoint&api-version=2023-07-31'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/json'
    'x-ms-client-request-id': '467114a6-567c-11f0-ae5c-ed518bc750bc'
    'User-Agent': 'azsdk-python-ai-formrecognizer/3.3.3 Python/3.10.16 (macOS-15.1-arm64-arm-64bit)'
    'Ocp-Apim-Subscription-Key': 'REDACTED'
A body is sent with the request


Running Azure OCR...


INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 202
Response headers:
    'Content-Length': '0'
    'Operation-Location': 'https://free-f0-instance.cognitiveservices.azure.com/formrecognizer/documentModels/prebuilt-document/analyzeResults/f219603d-601f-42da-a715-b5c440b5c0a9?api-version=2023-07-31'
    'x-envoy-upstream-service-time': '104'
    'apim-request-id': 'f219603d-601f-42da-a715-b5c440b5c0a9'
    'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload'
    'x-content-type-options': 'nosniff'
    'x-ms-region': 'East US'
    'Date': 'Tue, 01 Jul 2025 13:07:01 GMT'
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://free-f0-instance.cognitiveservices.azure.com/formrecognizer/documentModels/prebuilt-document/analyzeResults/f219603d-601f-42da-a715-b5c440b5c0a9?api-version=2023-07-31'
Request method: 'GET'
Request headers:
    'x-ms-client-request-id': '467114a6-567c-11f0-ae5c-ed518bc750bc'
    'User-Agent': 'azsdk-p

OCR completed: 164 lines extracted


In [6]:
from src.ocr.postprocess import normalize_ocr_lines
normalized_ocr_lines = normalize_ocr_lines(ocr_lines)
print(f"OCR post-processing completed: {len(normalized_ocr_lines)} normalized lines")

OCR post-processing completed: 76 normalized lines


In [7]:
doc_config = DocumentProcessingConfig.from_json("config/document_types.conf")
credit_request_config = doc_config.document_types["credit_request"]

In [8]:
visualization_dir = Path(f"notebooks/docs/{customer_name}")
visualization_dir.mkdir(exist_ok=True)
visualization_path = visualization_dir / f"{document_id}_annotated.png"

In [9]:
visualize_extracted_fields(
    pdf_path=document_path,
    normalized_data=normalized_ocr_lines,
    output_path=visualization_path,
    doc_config=credit_request_config
)
print(f"Visualization generated: {visualization_path}")

INFO:src.visualization.pdf_visualizer:Drew 42 boxes on page 1


Visualization generated: notebooks/docs/Hotel zur Taube/81e1a236-7ca1-452b-8ef6-4ade61a71bba_annotated.png


In [10]:
print("Extracting structured fields from OCR data...")

from src.llm.field_extractor import extract_fields_with_llm
from src.llm.client import OllamaClient
import asyncio
import nest_asyncio

llm_client = OllamaClient(
    base_url=app_config.generative_llm.url,
    model_name=app_config.generative_llm.model_name
)

def run_async_in_notebook(coro):
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None

    if loop and loop.is_running():
        nest_asyncio.apply()
        return loop.run_until_complete(coro)
    else:
        return asyncio.run(coro)

try:
    extracted_fields_result = run_async_in_notebook(extract_fields_with_llm(
        ocr_lines=normalized_ocr_lines,
        doc_config=credit_request_config,
        llm_client=llm_client,
        original_ocr_lines=ocr_lines
    ))

    extracted_fields = extracted_fields_result.get('extracted_fields', {})
    missing_fields = extracted_fields_result.get('missing_fields', [])
    validation_results = extracted_fields_result.get('validation_results', {})

    # Only keep fields that are actually present and non-empty
    normalized_fields = {
        field_name: field_data['value'] if isinstance(field_data, dict) and 'value' in field_data else str(field_data)
        for field_name, field_data in extracted_fields.items()
        if field_data and (isinstance(field_data, dict) and field_data.get('value')) or (isinstance(field_data, str) and field_data.strip())
    }

    print("\nExtracted fields:")
    for field_name, value in normalized_fields.items():
        print(f"   {field_name}: {value}")

    if missing_fields:
        print(f"\nMissing fields: {missing_fields}")

except Exception as e:
    print(f"LLM extraction failed: {e}")
    normalized_fields = {}
    # Stop and remove the Ollama container
    try:
        ollama_container.stop()
        print("Ollama container stopped and removed successfully")
    except Exception as e:
        print(f"Error stopping/removing container: {e}")
        # Stop and remove the Ollama container
        try:
            ollama_container.stop()
            print("Ollama container stopped and removed successfully")
        except Exception as e:
            print(f"Error stopping/removing container: {e}")

Extracting structured fields from OCR data...

Extracted fields:
   company_name: Hotel zur Taube
   legal_form: Personengesellschaft
   founding_date: 01.01.2010
   business_address: Taubenstraße6, 89264
   commercial_register: 3423431242
   vat_id: 987654
   property_type: Hotel zur Taube
   property_name: Hotel zor Taube
   property_address: Taubenstraße 6, 89264
   purchase_price: 300.000
   requested_amount: 220.000
   purpose: Renovierung
   equity_share: 60.0
   construction_year: 2006
   total_area: 250gm
   loan_amount: 220.000
   term: 50 Monate
   monthly_payment: 5.000
   interest_rate: fest

Missing fields: ['website']


In [11]:
# Prepare OCR Data
extrahierte_daten_data = []
for field_name, field_data in extracted_fields.items():
    if field_data and isinstance(field_data, dict) and field_data.get('value'):
        field_value = field_data['value']
        confidence_score = field_data.get('confidence', 0.5)
        bounding_box = field_data.get('bounding_box')
        page_number = field_data.get('page')
        
        # Convert bounding box to JSON string if it exists
        position_info = json.dumps(bounding_box) if bounding_box else None
        
        extrahierte_daten_data.append({
            "Feldname": field_name,
            "Wert": field_value,
            "Position im Dokument": position_info,
            "Konfidenzscore": confidence_score,
            "FK_Dokument": document_id
        })

In [12]:
print("normalized_fields:", normalized_fields)
print("extrahierte_daten_data:", extrahierte_daten_data)

with pd.ExcelWriter(f"notebooks/docs/{customer_name}/{document_id}_ocr_results.xlsx") as writer:
    pd.DataFrame(extrahierte_daten_data).to_excel(writer, sheet_name="Extrahierte Daten", index=False)

print("Results written to ocr_results.xlsx")

normalized_fields: {'company_name': 'Hotel zur Taube', 'legal_form': 'Personengesellschaft', 'founding_date': '01.01.2010', 'business_address': 'Taubenstraße6, 89264', 'commercial_register': '3423431242', 'vat_id': '987654', 'property_type': 'Hotel zur Taube', 'property_name': 'Hotel zor Taube', 'property_address': 'Taubenstraße 6, 89264', 'purchase_price': '300.000', 'requested_amount': '220.000', 'purpose': 'Renovierung', 'equity_share': '60.0', 'construction_year': '2006', 'total_area': '250gm', 'loan_amount': '220.000', 'term': '50 Monate', 'monthly_payment': '5.000', 'interest_rate': 'fest'}
extrahierte_daten_data: [{'Feldname': 'company_name', 'Wert': 'Hotel zur Taube', 'Position im Dokument': '[{"x": 3.6531, "y": 1.7283}, {"x": 4.8421, "y": 1.7283}, {"x": 4.8421, "y": 1.9049}, {"x": 3.6531, "y": 1.9049}]', 'Konfidenzscore': 0.98, 'FK_Dokument': '81e1a236-7ca1-452b-8ef6-4ade61a71bba'}, {'Feldname': 'legal_form', 'Wert': 'Personengesellschaft', 'Position im Dokument': '[{"x": 3.57

In [13]:
# Stop and remove the Ollama container
try:
    ollama_container.stop()
    print("Ollama container stopped and removed successfully")
except Exception as e:
    print(f"Error stopping/removing container: {e}")

Ollama container stopped and removed successfully
