# Credit OCR System Process

In [1]:
# Standard library imports
import json
import logging
import os
import sys
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
import requests

# Third-party imports
import pandas as pd
import pyodbc
from dotenv import load_dotenv

# Set up Python path and working directory FIRST.
# The project root is located by walking up from the current directory until a
# folder containing both `src/` and `config/` is found. Unlike a blind
# "go up one level", this stays correct when the cell is re-run after the
# working directory has already been changed to the project root.
_start_directory = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (_start_directory, *_start_directory.parents)
        if (candidate / "src").is_dir() and (candidate / "config").is_dir()
    ),
    _start_directory.parent,  # fallback: one level up from notebooks/
)
if str(project_root) not in sys.path:  # avoid stacking duplicate entries on re-run
    sys.path.insert(0, str(project_root))
os.chdir(project_root)  # Change to project root for config loading

# Import backend modules (now that path is set up)
from src.config import AppConfig, DocumentProcessingConfig
from src.llm.client import OllamaClient
from src.llm.field_extractor import extract_fields_with_llm
from src.ocr.azure_ocr_client import analyze_single_document_with_azure
from src.ocr.postprocess import extract_text_lines_with_bbox_and_confidence
from src.visualization.pdf_visualizer import visualize_extracted_fields

In [2]:
# Import the Docker test-container helper; on failure print diagnostics and re-raise.
try:
    from testcontainers.core.container import DockerContainer
except ImportError as import_error:
    print(f"Import error: {import_error}")
    # testcontainers is a third-party package, so a failed import points at the
    # Python environment rather than at the current working directory.
    print("Please ensure the 'testcontainers' package is installed in the active environment (pip install testcontainers)")
    print(f"Python executable: {sys.executable}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python path: {sys.path}")
    raise

# Build the application configuration from the "config" location, which is
# resolved relative to the current working directory, then echo the LLM settings.
config_location = "config"
app_config: AppConfig = AppConfig(config_location)
print(f"Model name: {app_config.generative_llm.model_name}")
print(f"URL: {app_config.generative_llm.url}")

# Start the generative model with ollama using Docker container (fixed port)
OLLAMA_CONTAINER_PORT = 11434  # Ollama's default API port inside the container
ollama_port = 11435  # Fixed host port instead of a dynamically assigned one

print("Starting Ollama test container...")
ollama_container = DockerContainer("ollama/ollama:latest")
# Only the container's API port has to be published. The previous extra
# with_exposed_ports(11435) exposed a container port nothing listens on.
ollama_container.with_bind_ports(OLLAMA_CONTAINER_PORT, ollama_port)
ollama_container.start()
os.environ["OLLAMA_HOST"] = "localhost"
os.environ["OLLAMA_PORT"] = str(ollama_port)

print(f"Ollama started on port {ollama_port}")
print("Test environment started (Ollama, etc.)")

# Load the required model
print("Loading Ollama model...")
model_name = app_config.generative_llm.model_name
ollama_url = f"http://localhost:{ollama_port}"

# Wait for Ollama to be ready.
# Every unsuccessful attempt (connection error OR non-200 response) is followed
# by a pause, and exhausting all attempts always raises instead of silently
# continuing with a server that never became ready.
max_retries = 30
retry_delay_seconds = 2
last_error = None
for attempt in range(max_retries):
    try:
        response = requests.get(f"{ollama_url}/api/tags", timeout=5)
        if response.status_code == 200:
            print("Ollama is ready")
            break
        last_error = RuntimeError(
            f"unexpected HTTP status {response.status_code} from {ollama_url}/api/tags"
        )
    except requests.RequestException as e:
        last_error = e

    if attempt < max_retries - 1:
        print(f"Waiting for Ollama to be ready... (attempt {attempt + 1}/{max_retries})")
        time.sleep(retry_delay_seconds)
else:
    # Reached only when the loop finished without `break`, i.e. never ready.
    print(f"Ollama failed to become ready: {last_error}")
    raise last_error

# Check if model is already loaded and pull it when it is missing.
# This step is best effort: failures are reported but do not stop the notebook.
try:
    # Bounded timeout so a hung server cannot block the cell forever.
    response = requests.get(f"{ollama_url}/api/tags", timeout=10)
    # A non-200 answer used to be ignored silently; surface it as an error.
    response.raise_for_status()
    models = response.json().get("models", [])
    model_names = [m.get("name") for m in models]
    if model_name in model_names:
        print(f"Model {model_name} is already loaded")
    else:
        print(f"Model {model_name} not found, pulling...")
        # Pull the model. With "stream": False the server answers once with the
        # final result, so the status code reflects the outcome of the pull
        # rather than just the start of a progress stream.
        pull_response = requests.post(
            f"{ollama_url}/api/pull",
            json={"name": model_name, "stream": False},
            timeout=(5, None),  # short connect timeout, unlimited download time
        )
        if pull_response.status_code == 200:
            print(f"Model {model_name} pulled successfully")
        else:
            print(f"Failed to pull model: {pull_response.text}")
except Exception as e:
    print(f"Error checking/loading model: {e}")

print("Ollama setup complete!")

INFO:src.config:Using Ollama URL from config file: http://127.0.0.1:11435
Pulling image ollama/ollama:latest
INFO:testcontainers.core.container:Pulling image ollama/ollama:latest
Container started: dd322a3d078c
INFO:testcontainers.core.container:Container started: dd322a3d078c


Model name: llama3.1:8b
URL: http://127.0.0.1:11435
Starting Ollama test container...
Ollama started on port 11435
Test environment started (Ollama, etc.)
Loading Ollama model...
Waiting for Ollama to be ready... (attempt 1/30)
Ollama is ready
Model llama3.1:8b not found, pulling...
Model llama3.1:8b pulled successfully
Ollama setup complete!


In [3]:
# Root logging setup (INFO level, timestamped records) plus a logger for this notebook
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [4]:
# Create the folder for annotated overlay images if it is missing and show where it lives
overlay_directory = Path("./overlayed_docs")
overlay_directory.mkdir(exist_ok=True)
overlay_directory_absolute = overlay_directory.absolute()
print(f"Overlay directory: {overlay_directory.absolute()}")

Overlay directory: /Users/markuskuehnle/Documents/projects/credit-ocr-demo-backend/overlayed_docs


---

## 1. Manually define one CreditRequest

In [5]:
# Hand-written demo CreditRequest: identifiers are fresh random UUIDs on every run
credit_request_id = str(uuid.uuid4())
customer_name = "Demo Tech GmbH"
credit_request_metadata = dict(
    id=credit_request_id,
    customer_name=customer_name,
    request_type="Immobilienfinanzierung",
    purpose="Kauf einer Gewerbeimmobilie",
    created_at=datetime.now().isoformat(),
    status="In Bearbeitung",
)

# The single input document that belongs to this request
document_filename = "sample_creditrequest.pdf"
document_path = Path("tests/tmp") / document_filename  # Now relative to project root
document_type = "Kreditantrag"
document_id = str(uuid.uuid4())

# Echo everything that was just defined, one line per attribute
summary_lines = [
    f"CreditRequest defined:",
    f"   ID: {credit_request_id}",
    f"   Customer: {customer_name}",
    f"   Type: {credit_request_metadata['request_type']}",
    f"   Purpose: {credit_request_metadata['purpose']}",
    f"   Document ID: {document_id}",
    f"   Document Path: {document_path}",
    f"   Document Type: {document_type}",
]
print("\n".join(summary_lines))

# Fail fast when the input document is not on disk
document_is_present = document_path.exists()
if document_is_present is False:
    raise FileNotFoundError(f"Document not found: {document_path}")
print(f"Document found: {document_path}")

CreditRequest defined:
   ID: 4a5debc0-cc13-4fa5-9542-42c366cee92a
   Customer: Demo Tech GmbH
   Type: Immobilienfinanzierung
   Purpose: Kauf einer Gewerbeimmobilie
   Document ID: 0c3a43bf-76f5-4862-89b8-fc133ac405da
   Document Path: tests/tmp/sample_creditrequest.pdf
   Document Type: Kreditantrag
Document found: tests/tmp/sample_creditrequest.pdf


---

## 2. Run OCR with Azure

In [6]:
# Send the document to Azure OCR, then convert the raw result into text lines
print("Running Azure OCR...")
document_path_as_text = str(document_path)
azure_ocr_result = analyze_single_document_with_azure(document_path_as_text)
ocr_lines = extract_text_lines_with_bbox_and_confidence(
    azure_ocr_result
)
print(f"OCR completed: {len(ocr_lines)} lines extracted")

INFO:src.ocr.azure_ocr_client:Sending document to Azure OCR: tests/tmp/sample_creditrequest.pdf
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://free-f0-instance.cognitiveservices.azure.com/formrecognizer/documentModels/prebuilt-document:analyze?stringIndexType=unicodeCodePoint&api-version=2023-07-31'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/json'
    'x-ms-client-request-id': 'd01c2af6-55be-11f0-b934-930f53017094'
    'User-Agent': 'azsdk-python-ai-formrecognizer/3.3.3 Python/3.10.16 (macOS-15.1-arm64-arm-64bit)'
    'Ocp-Apim-Subscription-Key': 'REDACTED'
A body is sent with the request


Running Azure OCR...


INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 202
Response headers:
    'Content-Length': '0'
    'Operation-Location': 'https://free-f0-instance.cognitiveservices.azure.com/formrecognizer/documentModels/prebuilt-document/analyzeResults/cb6d1d19-97b8-4b22-9683-267fba6f0c38?api-version=2023-07-31'
    'x-envoy-upstream-service-time': '74'
    'apim-request-id': 'cb6d1d19-97b8-4b22-9683-267fba6f0c38'
    'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload'
    'x-content-type-options': 'nosniff'
    'x-ms-region': 'East US'
    'Date': 'Mon, 30 Jun 2025 14:30:47 GMT'
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://free-f0-instance.cognitiveservices.azure.com/formrecognizer/documentModels/prebuilt-document/analyzeResults/cb6d1d19-97b8-4b22-9683-267fba6f0c38?api-version=2023-07-31'
Request method: 'GET'
Request headers:
    'x-ms-client-request-id': 'd01c2af6-55be-11f0-b934-930f53017094'
    'User-Agent': 'azsdk-py

OCR completed: 193 lines extracted


In [7]:
from src.ocr.postprocess import normalize_ocr_lines

# Post-process the raw OCR lines (the recorded run went from 193 raw to 79 normalized lines)
normalized_ocr_lines = normalize_ocr_lines(
    ocr_lines
)
print(f"OCR post-processing completed: {len(normalized_ocr_lines)} normalized lines")

OCR post-processing completed: 79 normalized lines


In [8]:
# Read the document-type definitions and pick the entry registered as "credit_request"
document_types_file = "config/document_types.conf"
doc_config = DocumentProcessingConfig.from_json(document_types_file)
credit_request_config = doc_config.document_types["credit_request"]

In [9]:
# Output location of the annotated image, named after the document id
visualization_dir = Path("overlayed_docs")
visualization_dir.mkdir(exist_ok=True)
annotated_image_name = f"{document_id}_annotated.png"
visualization_path = visualization_dir / annotated_image_name

In [10]:
# Render the normalized OCR data on top of the PDF and save the result as an image
visualization_arguments = dict(
    pdf_path=document_path,
    normalized_data=normalized_ocr_lines,
    output_path=visualization_path,
    doc_config=credit_request_config,
)
visualize_extracted_fields(**visualization_arguments)
print(f"Visualization generated: {visualization_path}")

INFO:src.visualization.pdf_visualizer:Drew 44 boxes on page 1
INFO:src.visualization.pdf_visualizer:Drew 0 boxes on page 2


Visualization generated: overlayed_docs/0c3a43bf-76f5-4862-89b8-fc133ac405da_annotated.png


In [11]:
print("Extracting structured fields from OCR data...")

import asyncio

import nest_asyncio

from src.llm.client import OllamaClient
from src.llm.field_extractor import extract_fields_with_llm

# Client for the generative model, using the URL and model name from the app configuration
generative_llm_settings = app_config.generative_llm
llm_client = OllamaClient(
    base_url=generative_llm_settings.url,
    model_name=generative_llm_settings.model_name,
)

def run_async_in_notebook(coro):
    """Run ``coro`` to completion and return its result.

    When called from a thread that already has a running event loop, the loop
    is patched with ``nest_asyncio.apply()`` and the coroutine is driven on it
    via ``run_until_complete``. Otherwise the coroutine is executed with
    ``asyncio.run``.
    """
    try:
        active_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread.
        active_loop = None

    if not (active_loop and active_loop.is_running()):
        return asyncio.run(coro)

    nest_asyncio.apply()
    return active_loop.run_until_complete(coro)

# Run the LLM field extraction and flatten the result into {field_name: value}.
# Defaults are assigned up front so later cells can rely on these names being
# defined even when the extraction fails.
extracted_fields = {}
missing_fields = []
validation_results = {}
normalized_fields = {}

try:
    extracted_fields_result = run_async_in_notebook(extract_fields_with_llm(
        ocr_lines=normalized_ocr_lines,
        doc_config=credit_request_config,
        llm_client=llm_client,
        original_ocr_lines=ocr_lines
    ))

    extracted_fields = extracted_fields_result.get('extracted_fields', {})
    missing_fields = extracted_fields_result.get('missing_fields', [])
    validation_results = extracted_fields_result.get('validation_results', {})

    # Only keep fields that are actually present and non-empty:
    # dict entries contribute their truthy 'value', plain strings are kept
    # when they contain non-whitespace text, everything else is dropped.
    for field_name, field_data in extracted_fields.items():
        if isinstance(field_data, dict) and field_data.get('value'):
            normalized_fields[field_name] = field_data['value']
        elif isinstance(field_data, str) and field_data.strip():
            normalized_fields[field_name] = field_data

    print("\nExtracted fields:")
    for field_name, value in normalized_fields.items():
        print(f"   {field_name}: {value}")

    if missing_fields:
        print(f"\nMissing fields: {missing_fields}")

except Exception as extraction_error:
    print(f"LLM extraction failed: {extraction_error}")
    # Treat a failed extraction as "nothing extracted" so downstream cells
    # operate on empty but well-defined data.
    extracted_fields = {}
    normalized_fields = {}
    # Stop and remove the Ollama container (attempted once)
    try:
        ollama_container.stop()
        print("Ollama container stopped and removed successfully")
    except Exception as stop_error:
        print(f"Error stopping/removing container: {stop_error}")

Extracting structured fields from OCR data...

Extracted fields:
   company_name: Demo Tech GmbH
   legal_form: Gesellschaft mit beschränkter Haftung (GmbH)
   founding_date: 15.03.2018
   business_address: Hauptstraße 123, 70173 Stuttgart
   commercial_register: HRB 123456 / Amtsgericht Stuttgart
   property_type: Gewerbeimmobilie - Bürogebäude
   property_name: InnovationsCampus Stuttgart
   property_address: Innovationsntraße 1, 70469 Stuttgart
   purchase_price: 4.200.000 €
   requested_amount: 3.500.000 €
   purpose: Kauf und Renovierung
   equity_share: 700.000 €
   construction_year: 1995
   total_area: 2.800 m2
   loan_amount: 3.500.000 €
   term: 20 Jahre
   monthly_payment: Ca. 18.000 € (monatlich)
   interest_rate: Festzins, 3.2% p.a.
   early_repayment: [x] ja [ ] nein
   public_funding: [ ] ja [x] nein

Missing fields: ['vat_id', 'website']


In [12]:
# Build the three export tables (Kreditantrag, Dokument, Extrahierte Daten)
# from the extraction results and write them to one Excel workbook.

# The extraction cell may have failed before defining `extracted_fields`;
# fall back to an empty mapping instead of raising a NameError below.
if 'extracted_fields' not in globals() or not extracted_fields:
    extracted_fields = {}
    print("Warning: No extracted fields available from LLM")

# 1. Prepare Kreditantrag data (as a list of dicts, even if only one row)
# A row is only produced when all four mandatory fields were extracted.
kreditantrag_data = []
if all(normalized_fields.get(f) for f in ["loan_amount", "term", "purpose", "company_name"]):
    kreditantrag_data.append({
        "Antragsstatus": "In Bearbeitung",
        "Texterkennungsstatus": "Erkannt",
        "Kreditbetrag": normalized_fields.get('loan_amount'),
        "Laufzeit": normalized_fields.get('term'),
        "Zinsart": normalized_fields.get('interest_type', "Festzins"),
        "Zinssatz": normalized_fields.get('interest_rate'),
        "Sondertilgung": normalized_fields.get('early_repayment'),
        "Verwendungszweck": normalized_fields.get('purpose'),
        "WeitereInformationen": f"Customer: {normalized_fields.get('company_name')}, Purpose: {normalized_fields.get('purpose')}",
        "KredittypID": normalized_fields.get('kredittyp_id'),
        "Status": "In Bearbeitung",
        "CustomerName": normalized_fields.get('company_name'),
        "RequestType": normalized_fields.get('request_type'),
        "Purpose": normalized_fields.get('purpose')
    })

# 2. Prepare Dokument data (only when a Kreditantrag row exists to link to)
dokument_data = []
if kreditantrag_data:
    dokument_data.append({
        "Dokumententyp": normalized_fields.get("document_type", "Kreditantrag"),
        "Pfad DMS": f"overlayed_docs/{document_id}_annotated.png",
        "FK_Kreditantrag": credit_request_id
    })

# 3. Prepare Extrahierte Daten data: one row per dict-shaped field with a truthy value
extrahierte_daten_data = []
for field_name, field_data in extracted_fields.items():
    if field_data and isinstance(field_data, dict) and field_data.get('value'):
        field_value = field_data['value']
        confidence_score = field_data.get('confidence', 0.5)  # 0.5 when no confidence was reported
        bounding_box = field_data.get('bounding_box')

        # Convert bounding box to JSON string if it exists
        position_info = json.dumps(bounding_box) if bounding_box else None

        extrahierte_daten_data.append({
            "Feldname": field_name,
            "Wert": field_value,
            "Position im Dokument": position_info,
            "Konfidenzscore": confidence_score,
            "FK_Dokument": document_id
        })

# 4. Write to Excel (one sheet per table)
with pd.ExcelWriter("ocr_results.xlsx") as writer:
    pd.DataFrame(kreditantrag_data).to_excel(writer, sheet_name="Kreditantrag", index=False)
    pd.DataFrame(dokument_data).to_excel(writer, sheet_name="Dokument", index=False)
    pd.DataFrame(extrahierte_daten_data).to_excel(writer, sheet_name="Extrahierte Daten", index=False)

print("Results written to ocr_results.xlsx")

Results written to ocr_results.xlsx


In [13]:
# Dump the intermediate structures for inspection
for debug_label, debug_payload in (
    ("normalized_fields:", normalized_fields),
    ("kreditantrag_data:", kreditantrag_data),
    ("dokument_data:", dokument_data),
    ("extrahierte_daten_data:", extrahierte_daten_data),
):
    print(debug_label, debug_payload)

# Write the workbook again, one sheet per table, in the same sheet order
sheet_contents = (
    ("Kreditantrag", kreditantrag_data),
    ("Dokument", dokument_data),
    ("Extrahierte Daten", extrahierte_daten_data),
)
with pd.ExcelWriter("ocr_results.xlsx") as writer:
    for target_sheet, sheet_rows in sheet_contents:
        pd.DataFrame(sheet_rows).to_excel(writer, sheet_name=target_sheet, index=False)

normalized_fields: {'company_name': 'Demo Tech GmbH', 'legal_form': 'Gesellschaft mit beschränkter Haftung (GmbH)', 'founding_date': '15.03.2018', 'business_address': 'Hauptstraße 123, 70173 Stuttgart', 'commercial_register': 'HRB 123456 / Amtsgericht Stuttgart', 'property_type': 'Gewerbeimmobilie - Bürogebäude', 'property_name': 'InnovationsCampus Stuttgart', 'property_address': 'Innovationsntraße 1, 70469 Stuttgart', 'purchase_price': '4.200.000 €', 'requested_amount': '3.500.000 €', 'purpose': 'Kauf und Renovierung', 'equity_share': '700.000 €', 'construction_year': '1995', 'total_area': '2.800 m2', 'loan_amount': '3.500.000 €', 'term': '20 Jahre', 'monthly_payment': 'Ca. 18.000 € (monatlich)', 'interest_rate': 'Festzins, 3.2% p.a.', 'early_repayment': '[x] ja [ ] nein', 'public_funding': '[ ] ja [x] nein'}
kreditantrag_data: [{'Antragsstatus': 'In Bearbeitung', 'Texterkennungsstatus': 'Erkannt', 'Kreditbetrag': '3.500.000 €', 'Laufzeit': '20 Jahre', 'Zinsart': 'Festzins', 'Zinssatz

In [14]:
# Build the three export tables (Kreditantrag, Dokument, Extrahierte Daten),
# write them to one Excel workbook, and print the intermediate structures.

# 1. Prepare Kreditantrag data (as a list of dicts, even if only one row)
# A row is only produced when all four mandatory fields were extracted.
kreditantrag_data = []
if all(normalized_fields.get(f) for f in ["loan_amount", "term", "purpose", "company_name"]):
    kreditantrag_data.append({
        "Antragsstatus": "In Bearbeitung",
        "Texterkennungsstatus": "Erkannt",
        "Kreditbetrag": normalized_fields.get('loan_amount'),
        "Laufzeit": normalized_fields.get('term'),
        "Zinsart": normalized_fields.get('interest_type', "Festzins"),
        "Zinssatz": normalized_fields.get('interest_rate'),
        "Sondertilgung": normalized_fields.get('early_repayment'),
        "Verwendungszweck": normalized_fields.get('purpose'),
        "WeitereInformationen": f"Customer: {normalized_fields.get('company_name')}, Purpose: {normalized_fields.get('purpose')}",
        "KredittypID": normalized_fields.get('kredittyp_id'),
        "Status": "In Bearbeitung",
        "CustomerName": normalized_fields.get('company_name'),
        "RequestType": normalized_fields.get('request_type'),
        "Purpose": normalized_fields.get('purpose')
    })

# 2. Prepare Dokument data (only when a Kreditantrag row exists to link to)
dokument_data = []
if kreditantrag_data:
    dokument_data.append({
        "Dokumententyp": normalized_fields.get("document_type", "Kreditantrag"),
        "Pfad DMS": f"overlayed_docs/{document_id}_annotated.png",
        "FK_Kreditantrag": credit_request_id
    })

# 3. Prepare Extrahierte Daten data
# Initialize extracted_fields if not available from LLM
if 'extracted_fields' not in globals() or not extracted_fields:
    extracted_fields = {}
    print("Warning: No extracted fields available from LLM")

# One row per dict-shaped field with a truthy value
extrahierte_daten_data = []
for field_name, field_data in extracted_fields.items():
    if field_data and isinstance(field_data, dict) and field_data.get('value'):
        field_value = field_data['value']
        confidence_score = field_data.get('confidence', 0.5)  # 0.5 when no confidence was reported
        bounding_box = field_data.get('bounding_box')

        # Convert bounding box to JSON string if it exists
        position_info = json.dumps(bounding_box) if bounding_box else None

        extrahierte_daten_data.append({
            "Feldname": field_name,
            "Wert": field_value,
            "Position im Dokument": position_info,
            "Konfidenzscore": confidence_score,
            "FK_Dokument": document_id
        })

# 4. Write to Excel (once; the tables do not change afterwards, so a second
# identical write of the same workbook is unnecessary)
with pd.ExcelWriter("ocr_results.xlsx") as writer:
    pd.DataFrame(kreditantrag_data).to_excel(writer, sheet_name="Kreditantrag", index=False)
    pd.DataFrame(dokument_data).to_excel(writer, sheet_name="Dokument", index=False)
    pd.DataFrame(extrahierte_daten_data).to_excel(writer, sheet_name="Extrahierte Daten", index=False)

print("Results written to ocr_results.xlsx")
print("normalized_fields:", normalized_fields)
print("kreditantrag_data:", kreditantrag_data)
print("dokument_data:", dokument_data)
print("extrahierte_daten_data:", extrahierte_daten_data)

Results written to ocr_results.xlsx
normalized_fields: {'company_name': 'Demo Tech GmbH', 'legal_form': 'Gesellschaft mit beschränkter Haftung (GmbH)', 'founding_date': '15.03.2018', 'business_address': 'Hauptstraße 123, 70173 Stuttgart', 'commercial_register': 'HRB 123456 / Amtsgericht Stuttgart', 'property_type': 'Gewerbeimmobilie - Bürogebäude', 'property_name': 'InnovationsCampus Stuttgart', 'property_address': 'Innovationsntraße 1, 70469 Stuttgart', 'purchase_price': '4.200.000 €', 'requested_amount': '3.500.000 €', 'purpose': 'Kauf und Renovierung', 'equity_share': '700.000 €', 'construction_year': '1995', 'total_area': '2.800 m2', 'loan_amount': '3.500.000 €', 'term': '20 Jahre', 'monthly_payment': 'Ca. 18.000 € (monatlich)', 'interest_rate': 'Festzins, 3.2% p.a.', 'early_repayment': '[x] ja [ ] nein', 'public_funding': '[ ] ja [x] nein'}
kreditantrag_data: [{'Antragsstatus': 'In Bearbeitung', 'Texterkennungsstatus': 'Erkannt', 'Kreditbetrag': '3.500.000 €', 'Laufzeit': '20 Jahr

In [27]:
# Tear down the Ollama container; a failure is reported but not raised.
# (The recorded output shows the failure branch: the container was already gone.)
try:
    ollama_container.stop()
except Exception as stop_error:
    print(f"Error stopping/removing container: {stop_error}")
else:
    print("Ollama container stopped and removed successfully")

Error stopping/removing container: 404 Client Error for http+docker://localhost/v1.48/containers/dd322a3d078ca265b0f4393e3f32c3565424bdecad4914f4574365c9ee2aae5a?v=True&link=False&force=True: Not Found ("No such container: dd322a3d078ca265b0f4393e3f32c3565424bdecad4914f4574365c9ee2aae5a")


---

## 3. Azure DB Connection

In [16]:
# # Load credentials
# load_dotenv("azure.env")

# server = os.getenv("SQL_SERVER")
# database = os.getenv("DB_NAME")
# username = os.getenv("DB_USER")
# password = os.getenv("DB_PASSWORD")

In [17]:
# # Minimal connection string (adjust as needed)
# conn_str = (
#     'DRIVER={ODBC Driver 17 for SQL Server};'
#     f'SERVER={server};'
#     f'DATABASE={database};'
#     f'UID={username};'
#     f'PWD={password}'
# )

In [18]:
# try:
#     conn = pyodbc.connect(conn_str, timeout=5)
#     cursor = conn.cursor()
#     cursor.execute("SELECT 1")
#     print("Connection successful.")
# except Exception as e:
#     print("Connection failed:", e)

---

In [19]:
# # ============================================================================
# # DATABASE SCHEMA INSPECTION
# # ============================================================================
# # Check which tables and entities are already available in the database

# print("Checking available database tables and entities...")

# # Check existing tables
# check_tables_sql = """
# SELECT 
#     TABLE_SCHEMA,
#     TABLE_NAME,
#     TABLE_TYPE
# FROM INFORMATION_SCHEMA.TABLES 
# WHERE TABLE_TYPE = 'BASE TABLE'
# ORDER BY TABLE_SCHEMA, TABLE_NAME
# """

# try:
#     cursor.execute(check_tables_sql)
#     tables = cursor.fetchall()
    
#     print(f"Found {len(tables)} tables in the database:")
#     for table in tables:
#         schema, name, table_type = table
#         print(f"   {schema}.{name} ({table_type})")
        
# except Exception as e:
#     print(f"Failed to check tables: {e}")

# # Check Kreditantrag table structure if it exists
# print("\nChecking Kreditantrag table structure...")
# check_kreditantrag_columns_sql = """
# SELECT 
#     COLUMN_NAME,
#     DATA_TYPE,
#     IS_NULLABLE,
#     COLUMN_DEFAULT
# FROM INFORMATION_SCHEMA.COLUMNS 
# WHERE TABLE_NAME = 'Kreditantrag' 
# ORDER BY ORDINAL_POSITION
# """

# try:
#     cursor.execute(check_kreditantrag_columns_sql)
#     columns = cursor.fetchall()
    
#     if columns:
#         print(f"Kreditantrag table has {len(columns)} columns:")
#         for column in columns:
#             name, data_type, nullable, default_val = column
#             print(f"   {name}: {data_type} (nullable: {nullable}, default: {default_val})")
#     else:
#         print("Kreditantrag table not found")
        
# except Exception as e:
#     print(f"Failed to check Kreditantrag columns: {e}")

# # Check Dokument table structure if it exists
# print("\nChecking Dokument table structure...")
# check_dokument_columns_sql = """
# SELECT 
#     COLUMN_NAME,
#     DATA_TYPE,
#     IS_NULLABLE,
#     COLUMN_DEFAULT
# FROM INFORMATION_SCHEMA.COLUMNS 
# WHERE TABLE_NAME = 'Dokument' 
# ORDER BY ORDINAL_POSITION
# """

# try:
#     cursor.execute(check_dokument_columns_sql)
#     columns = cursor.fetchall()
    
#     if columns:
#         print(f"Dokument table has {len(columns)} columns:")
#         for column in columns:
#             name, data_type, nullable, default_val = column
#             print(f"   {name}: {data_type} (nullable: {nullable}, default: {default_val})")
#     else:
#         print("Dokument table not found")
        
# except Exception as e:
#     print(f"Failed to check Dokument columns: {e}")

# # Check Extrahierte Daten table structure if it exists
# print("\nChecking Extrahierte Daten table structure...")
# check_extrahierte_daten_columns_sql = """
# SELECT 
#     COLUMN_NAME,
#     DATA_TYPE,
#     IS_NULLABLE,
#     COLUMN_DEFAULT
# FROM INFORMATION_SCHEMA.COLUMNS 
# WHERE TABLE_NAME = 'Extrahierte Daten' 
# ORDER BY ORDINAL_POSITION
# """

# try:
#     cursor.execute(check_extrahierte_daten_columns_sql)
#     columns = cursor.fetchall()
    
#     if columns:
#         print(f"Extrahierte Daten table has {len(columns)} columns:")
#         for column in columns:
#             name, data_type, nullable, default_val = column
#             print(f"   {name}: {data_type} (nullable: {nullable}, default: {default_val})")
#     else:
#         print("Extrahierte Daten table not found")
        
# except Exception as e:
#     print(f"Failed to check Extrahierte Daten columns: {e}")

# # Check foreign key relationships
# print("\nChecking foreign key relationships...")
# check_foreign_keys_sql = """
# SELECT 
#     fk.name AS FK_Name,
#     OBJECT_NAME(fk.parent_object_id) AS Table_Name,
#     COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS Column_Name,
#     OBJECT_NAME(fk.referenced_object_id) AS Referenced_Table_Name,
#     COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS Referenced_Column_Name
# FROM sys.foreign_keys fk
# INNER JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id
# WHERE OBJECT_NAME(fk.parent_object_id) IN ('Kreditantrag', 'Dokument', 'Extrahierte Daten')
# ORDER BY Table_Name, Column_Name
# """

# try:
#     cursor.execute(check_foreign_keys_sql)
#     foreign_keys = cursor.fetchall()
    
#     if foreign_keys:
#         print(f"Found {len(foreign_keys)} foreign key relationships:")
#         for fk in foreign_keys:
#             fk_name, table_name, column_name, ref_table, ref_column = fk
#             print(f"   {table_name}.{column_name} → {ref_table}.{ref_column} ({fk_name})")
#     else:
#         print("No foreign key relationships found for these tables")
        
# except Exception as e:
#     print(f"Failed to check foreign keys: {e}")

# print("\n" + "="*60)
# print("DATABASE SCHEMA SUMMARY")
# print("="*60)

In [20]:
# print("Checking contents of all tables...")

# # Check Kunde table contents
# print("\nKunde table contents:")
# try:
#     cursor.execute("SELECT TOP 5 * FROM dbo.Kunde")
#     kunde_data = cursor.fetchall()
#     if kunde_data:
#         print(f"Found {len(kunde_data)} customer records (showing first 5):")
#         for row in kunde_data:
#             print(f"   {row}")
#     else:
#         print("No customer records found")
# except Exception as e:
#     print(f"Failed to check Kunde table: {e}")

# # Check Kredittyp table contents
# print("\nKredittyp table contents:")
# try:
#     cursor.execute("SELECT * FROM dbo.Kredittyp")
#     kredittyp_data = cursor.fetchall()
#     if kredittyp_data:
#         print(f"Found {len(kredittyp_data)} credit type records:")
#         for row in kredittyp_data:
#             print(f"   {row}")
#     else:
#         print("No credit type records found")
# except Exception as e:
#     print(f"Failed to check Kredittyp table: {e}")

# # Check Mitarbeiter table contents
# print("\nMitarbeiter table contents:")
# try:
#     cursor.execute("SELECT TOP 5 * FROM dbo.Mitarbeiter")
#     mitarbeiter_data = cursor.fetchall()
#     if mitarbeiter_data:
#         print(f"Found {len(mitarbeiter_data)} employee records (showing first 5):")
#         for row in mitarbeiter_data:
#             print(f"   {row}")
#     else:
#         print("No employee records found")
# except Exception as e:
#     print(f"Failed to check Mitarbeiter table: {e}")

# # Check existing Kreditantrag records
# print("\nKreditantrag table contents:")
# try:
#     cursor.execute("SELECT TOP 5 * FROM dbo.Kreditantrag")
#     kreditantrag_data = cursor.fetchall()
#     if kreditantrag_data:
#         print(f"Found {len(kreditantrag_data)} credit application records (showing first 5):")
#         for row in kreditantrag_data:
#             print(f"   {row}")
#     else:
#         print("No credit application records found")
# except Exception as e:
#     print(f"Failed to check Kreditantrag table: {e}")

# # Check existing Dokument records
# print("\nDokument table contents:")
# try:
#     cursor.execute("SELECT TOP 5 * FROM dbo.Dokument")
#     dokument_data = cursor.fetchall()
#     if dokument_data:
#         print(f"Found {len(dokument_data)} document records (showing first 5):")
#         for row in dokument_data:
#             print(f"   {row}")
#     else:
#         print("No document records found")
# except Exception as e:
#     print(f"Failed to check Dokument table: {e}")

# # Check existing Extrahierte Daten records
# print("\nExtrahierte Daten table contents:")
# try:
#     cursor.execute("SELECT TOP 5 * FROM dbo.[Extrahierte Daten]")
#     extrahierte_daten_data = cursor.fetchall()
#     if extrahierte_daten_data:
#         print(f"Found {len(extrahierte_daten_data)} extracted data records (showing first 5):")
#         for row in extrahierte_daten_data:
#             print(f"   {row}")
#     else:
#         print("No extracted data records found")
# except Exception as e:
#     print(f"Failed to check Extrahierte Daten table: {e}")

# # Get column names for better readability
# print("\nGetting column names for better data display...")

# try:
#     # Kunde columns
#     cursor.execute("SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'Kunde' ORDER BY ORDINAL_POSITION")
#     kunde_columns = [col[0] for col in cursor.fetchall()]
#     print(f"Kunde columns: {kunde_columns}")
    
#     # Kredittyp columns
#     cursor.execute("SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'Kredittyp' ORDER BY ORDINAL_POSITION")
#     kredittyp_columns = [col[0] for col in cursor.fetchall()]
#     print(f"Kredittyp columns: {kredittyp_columns}")
    
#     # Mitarbeiter columns
#     cursor.execute("SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'Mitarbeiter' ORDER BY ORDINAL_POSITION")
#     mitarbeiter_columns = [col[0] for col in cursor.fetchall()]
#     print(f"Mitarbeiter columns: {mitarbeiter_columns}")
    
# except Exception as e:
#     print(f"Failed to get column names: {e}")

In [21]:
# DISABLED CELL: every line below is commented out, so nothing here executes.
# If re-enabled, the cell inserts a single Kreditantrag row built from
# `normalized_fields` and stores the ID returned by OUTPUT INSERTED.AntragID in
# `antrag_id`; `antrag_id` is set to None when a required field is missing or
# the insert raises. It uses `cursor`, `conn` and `normalized_fields`, none of
# which are defined in this cell.
#
# NOTE(review), to check before re-enabling:
#   - The required-field test uses truthiness, so a value of 0 or "" counts as
#     missing.
#   - float(loan_amount) raises on non-numeric text; that is caught by the broad
#     `except Exception` and reported as a failed insert.
#   - bool(early_repayment) is True for any non-empty string, including "false"
#     -- assumes the field is already a real boolean/number; TODO confirm.
#   - The `else: kredittyp_id = None` branch is redundant (the value is already
#     None on that path).
#   - This cell names the key column AntragID, while the validation cell queries
#     [Antrag ID]; confirm which spelling matches the schema.
# # 1. Insert Kreditantrag row
# print("\nInserting Kreditantrag row...")

# required_fields = ["loan_amount", "term", "purpose", "company_name"]
# if all(normalized_fields.get(f) for f in required_fields):
#     insert_kreditantrag_sql = """
#     INSERT INTO Kreditantrag (
#         Antragsstatus,
#         Texterkennungsstatus,
#         Kreditbetrag,
#         Laufzeit,
#         Zinsart,
#         Zinssatz,
#         Sondertilgung,
#         Verwendungszweck,
#         WeitereInformationen,
#         KredittypID,
#         Status,
#         CustomerName,
#         RequestType,
#         Purpose
#     )
#     OUTPUT INSERTED.AntragID
#     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
#     """

#     try:
#         kreditbetrag = float(normalized_fields.get('loan_amount'))
#         laufzeit = normalized_fields.get('term')
#         zinssatz = float(normalized_fields.get('interest_rate')) if normalized_fields.get('interest_rate') else None
#         zinssatz_typ = normalized_fields.get('interest_type', "Festzins")
#         sondertilgung = bool(normalized_fields.get('early_repayment')) if normalized_fields.get('early_repayment') is not None else False
#         verwendungszweck = normalized_fields.get('purpose')
#         customer_name = normalized_fields.get('company_name')
#         request_type = normalized_fields.get('request_type')
#         purpose = verwendungszweck
#         weitere_info = f"Customer: {customer_name}, Type: {request_type}, Purpose: {purpose}"

#         # Only use KredittypID if you have a real one, else set to None
#         kredittyp_id = normalized_fields.get('kredittyp_id')
#         if kredittyp_id is not None:
#             kredittyp_id = int(kredittyp_id)
#         else:
#             kredittyp_id = None

#         cursor.execute(
#             insert_kreditantrag_sql,
#             "In Bearbeitung",
#             "Erkannt",
#             kreditbetrag,
#             laufzeit,
#             zinssatz_typ,
#             zinssatz,
#             int(sondertilgung),
#             verwendungszweck,
#             weitere_info,
#             kredittyp_id,
#             "In Bearbeitung",
#             customer_name,
#             request_type,
#             purpose
#         )

#         result = cursor.fetchone()
#         if result and result[0] is not None:
#             antrag_id = int(result[0])
#             conn.commit()
#             print(f"✓ Kreditantrag inserted with ID: {antrag_id}")
#         else:
#             raise ValueError("No ID returned from INSERT")

#     except Exception as e:
#         print(f"Failed to insert Kreditantrag: {e}")
#         conn.rollback()
#         antrag_id = None
# else:
#     print("Skipping Kreditantrag insert: not all required fields present.")
#     antrag_id = None

In [22]:
# DISABLED CELL: every line below is commented out, so nothing here executes.
# If re-enabled, the cell inserts one dbo.Dokument row linked to `antrag_id` and
# stores the new key in `dokument_id` (None when the insert is skipped or raises).
# It uses `cursor`, `conn`, `antrag_id`, `document_type`, `document_path` and
# `document_id`, none of which are defined in this cell.
#
# NOTE(review), to check before re-enabling:
#   - `document_path` is only tested for truthiness; the stored [Pfad DMS] value
#     is built from `document_id` instead.
#   - SCOPE_IDENTITY() is fetched with a separate execute(). With parameterized
#     pyodbc statements this may come back NULL, in which case int(None) raises
#     and the insert is rolled back by the except branch. The Kreditantrag cell
#     uses OUTPUT INSERTED for the same purpose -- presumably the safer pattern
#     here as well; TODO confirm against the driver in use.
# # 2. Insert Dokument row
# if antrag_id and document_type and document_path:
#     print("\nInserting Dokument row...")

#     insert_dokument_sql = """
#     INSERT INTO dbo.Dokument (
#         [Dokumententyp], [Pfad DMS], [FK_Kreditantrag]
#     ) VALUES (?, ?, ?)
#     """

#     try:
#         document_path_dms = f"overlayed_docs/{document_id}_annotated.png"

#         cursor.execute(insert_dokument_sql, (
#             document_type,
#             document_path_dms,
#             antrag_id
#         ))

#         cursor.execute("SELECT SCOPE_IDENTITY()")
#         dokument_id = int(cursor.fetchone()[0])

#         conn.commit()
#         print(f"Dokument inserted with ID: {dokument_id}")

#     except Exception as e:
#         print(f"Failed to insert Dokument: {e}")
#         conn.rollback()
#         dokument_id = None
# else:
#     print("Skipping Dokument insertion - missing Antrag ID or document metadata")
#     dokument_id = None

In [23]:
# DISABLED CELL: every line below is commented out, so nothing here executes.
# If re-enabled, the cell inserts one dbo.[Extrahierte Daten] row per entry of
# `normalized_fields` whose value is not None and not blank, each linked to
# `dokument_id`. conn.commit() is called once after the loop; any exception
# triggers conn.rollback().
# Position and confidence are taken from the first entry of
# `normalized_ocr_lines` whose "text" contains the field name or the field value
# (case-insensitive substring); both stay None when no line matches.
#
# NOTE(review), to check before re-enabling:
#   - Matching on a value substring means short values (e.g. "5") can pick up an
#     unrelated OCR line, and the loop stops at the first hit.
#   - line.get("text", "") assumes every OCR line is a dict whose "text" is a
#     string -- TODO confirm against the OCR post-processing output.
# # 3. Insert Extrahierte Daten rows
# if antrag_id and dokument_id:
#     print("\nInserting Extrahierte Daten rows...")

#     insert_extrahierte_daten_sql = """
#     INSERT INTO dbo.[Extrahierte Daten] (
#         [Feldname], [Wert], [Position im Dokument], [Konfidenzscore], [FK_Dokument]
#     ) VALUES (?, ?, ?, ?, ?)
#     """

#     try:
#         fields_inserted_count = 0

#         for field_name, field_value in normalized_fields.items():
#             if field_value is not None and str(field_value).strip():
#                 position_info = None
#                 confidence_score = None

#                 for line in normalized_ocr_lines:
#                     if field_name.lower() in line.get("text", "").lower() or str(field_value).lower() in line.get("text", "").lower():
#                         if "bounding_box" in line:
#                             position_info = json.dumps(line["bounding_box"])
#                         if "confidence" in line:
#                             confidence_score = float(line["confidence"])
#                         break

#                 cursor.execute(insert_extrahierte_daten_sql, (
#                     field_name,
#                     str(field_value),
#                     position_info,
#                     confidence_score,
#                     dokument_id
#                 ))
#                 fields_inserted_count += 1

#         conn.commit()
#         print(f"✓ {fields_inserted_count} extracted fields inserted")

#     except Exception as e:
#         print(f"✗ Failed to insert Extrahierte Daten: {e}")
#         conn.rollback()
# else:
#     print("Skipping Extrahierte Daten insertion - missing Antrag ID or Dokument ID")

In [24]:
# DISABLED CELL: every line below is commented out, so nothing here executes.
# If re-enabled, the cell counts the rows stored for `antrag_id` / `dokument_id`
# in Kreditantrag, Dokument and [Extrahierte Daten], then prints one Kreditantrag
# row, one Dokument row and up to five Extrahierte Daten rows. It uses `cursor`,
# `antrag_id` and `dokument_id`, none of which are defined in this cell.
#
# NOTE(review), to check before re-enabling:
#   - The Kreditantrag SELECT lists [Antragsstatus] twice, and the value at
#     index 6 is printed under the label "Status"; the seventh column was
#     presumably meant to be [Status] -- TODO confirm.
#   - The key column is written [Antrag ID] here but AntragID in the Kreditantrag
#     insert cell; confirm which spelling matches the schema.
#   - The final "completed successfully" message is printed unconditionally,
#     including after the except branch has reported a failure.
#   - The next cell in the notebook is an identical copy of this one.
# # Validation queries
# print("\nPerforming validation queries...")

# try:
#     # Count Kreditantrag records
#     cursor.execute("SELECT COUNT(*) FROM dbo.Kreditantrag WHERE [Antrag ID] = ?", (antrag_id,))
#     kreditantrag_count = cursor.fetchone()[0]
#     print(f"Kreditantrag records with ID {antrag_id}: {kreditantrag_count}")
    
#     # Count Dokument records
#     cursor.execute("SELECT COUNT(*) FROM dbo.Dokument WHERE [Dokumenten ID] = ?", (dokument_id,))
#     dokument_count = cursor.fetchone()[0]
#     print(f"Dokument records with ID {dokument_id}: {dokument_count}")
    
#     # Count Extrahierte Daten records
#     cursor.execute("SELECT COUNT(*) FROM dbo.[Extrahierte Daten] WHERE [FK_Dokument] = ?", (dokument_id,))
#     extrahierte_daten_count = cursor.fetchone()[0]
#     print(f"Extrahierte Daten records for document {dokument_id}: {extrahierte_daten_count}")
    
#     # Show sample data
#     print("\nSample Kreditantrag data:")
#     cursor.execute("""
#         SELECT [Antrag ID], [ErstelltAm], [Antragsstatus], [Kreditbetrag], [Laufzeit], [Zinssatz], [Antragsstatus]
#         FROM dbo.Kreditantrag WHERE [Antrag ID] = ?
#     """, (antrag_id,))
#     kreditantrag_data = cursor.fetchone()
#     if kreditantrag_data:
#         print(f"   Antrag ID: {kreditantrag_data[0]}")
#         print(f"   ErstelltAm: {kreditantrag_data[1]}")
#         print(f"   Antragsstatus: {kreditantrag_data[2]}")
#         print(f"   Kreditbetrag: {kreditantrag_data[3]}")
#         print(f"   Laufzeit: {kreditantrag_data[4]}")
#         print(f"   Zinssatz: {kreditantrag_data[5]}")
#         print(f"   Status: {kreditantrag_data[6]}")
    
#     print("\nSample Dokument data:")
#     cursor.execute("""
#         SELECT [Dokumenten ID], [Dokumententyp], [Pfad DMS], [FK_Kreditantrag]
#         FROM dbo.Dokument WHERE [Dokumenten ID] = ?
#     """, (dokument_id,))
#     dokument_data = cursor.fetchone()
#     if dokument_data:
#         print(f"   Dokumenten ID: {dokument_data[0]}")
#         print(f"   Dokumententyp: {dokument_data[1]}")
#         print(f"   Pfad DMS: {dokument_data[2]}")
#         print(f"   FK_Kreditantrag: {dokument_data[3]}")
    
#     print("\nSample Extrahierte Daten:")
#     cursor.execute("""
#         SELECT [Feldname], [Wert], [Konfidenzscore]
#         FROM dbo.[Extrahierte Daten] WHERE [FK_Dokument] = ?
#         ORDER BY [Feldname]
#     """, (dokument_id,))
#     extrahierte_daten = cursor.fetchall()
#     for field_data in extrahierte_daten[:5]:  # Show first 5 fields
#         print(f"   {field_data[0]}: {field_data[1]} (confidence: {field_data[2]})")
    
# except Exception as e:
#     print(f"Validation queries failed: {e}")

# print("\nDatabase insertion completed successfully!")

In [25]:
# DISABLED CELL: every line below is commented out, so nothing here executes.
# NOTE(review): this cell is a line-for-line copy of the validation cell directly
# above it; one of the two can be removed.
# If re-enabled, the cell counts the rows stored for `antrag_id` / `dokument_id`
# in Kreditantrag, Dokument and [Extrahierte Daten], then prints one Kreditantrag
# row, one Dokument row and up to five Extrahierte Daten rows.
#
# NOTE(review), to check before re-enabling:
#   - The Kreditantrag SELECT lists [Antragsstatus] twice, and the value at
#     index 6 is printed under the label "Status"; the seventh column was
#     presumably meant to be [Status] -- TODO confirm.
#   - The final "completed successfully" message is printed unconditionally,
#     including after the except branch has reported a failure.
# # Validation queries
# print("\nPerforming validation queries...")

# try:
#     # Count Kreditantrag records
#     cursor.execute("SELECT COUNT(*) FROM dbo.Kreditantrag WHERE [Antrag ID] = ?", (antrag_id,))
#     kreditantrag_count = cursor.fetchone()[0]
#     print(f"Kreditantrag records with ID {antrag_id}: {kreditantrag_count}")
    
#     # Count Dokument records
#     cursor.execute("SELECT COUNT(*) FROM dbo.Dokument WHERE [Dokumenten ID] = ?", (dokument_id,))
#     dokument_count = cursor.fetchone()[0]
#     print(f"Dokument records with ID {dokument_id}: {dokument_count}")
    
#     # Count Extrahierte Daten records
#     cursor.execute("SELECT COUNT(*) FROM dbo.[Extrahierte Daten] WHERE [FK_Dokument] = ?", (dokument_id,))
#     extrahierte_daten_count = cursor.fetchone()[0]
#     print(f"Extrahierte Daten records for document {dokument_id}: {extrahierte_daten_count}")
    
#     # Show sample data
#     print("\nSample Kreditantrag data:")
#     cursor.execute("""
#         SELECT [Antrag ID], [ErstelltAm], [Antragsstatus], [Kreditbetrag], [Laufzeit], [Zinssatz], [Antragsstatus]
#         FROM dbo.Kreditantrag WHERE [Antrag ID] = ?
#     """, (antrag_id,))
#     kreditantrag_data = cursor.fetchone()
#     if kreditantrag_data:
#         print(f"   Antrag ID: {kreditantrag_data[0]}")
#         print(f"   ErstelltAm: {kreditantrag_data[1]}")
#         print(f"   Antragsstatus: {kreditantrag_data[2]}")
#         print(f"   Kreditbetrag: {kreditantrag_data[3]}")
#         print(f"   Laufzeit: {kreditantrag_data[4]}")
#         print(f"   Zinssatz: {kreditantrag_data[5]}")
#         print(f"   Status: {kreditantrag_data[6]}")
    
#     print("\nSample Dokument data:")
#     cursor.execute("""
#         SELECT [Dokumenten ID], [Dokumententyp], [Pfad DMS], [FK_Kreditantrag]
#         FROM dbo.Dokument WHERE [Dokumenten ID] = ?
#     """, (dokument_id,))
#     dokument_data = cursor.fetchone()
#     if dokument_data:
#         print(f"   Dokumenten ID: {dokument_data[0]}")
#         print(f"   Dokumententyp: {dokument_data[1]}")
#         print(f"   Pfad DMS: {dokument_data[2]}")
#         print(f"   FK_Kreditantrag: {dokument_data[3]}")
    
#     print("\nSample Extrahierte Daten:")
#     cursor.execute("""
#         SELECT [Feldname], [Wert], [Konfidenzscore]
#         FROM dbo.[Extrahierte Daten] WHERE [FK_Dokument] = ?
#         ORDER BY [Feldname]
#     """, (dokument_id,))
#     extrahierte_daten = cursor.fetchall()
#     for field_data in extrahierte_daten[:5]:  # Show first 5 fields
#         print(f"   {field_data[0]}: {field_data[1]} (confidence: {field_data[2]})")
    
# except Exception as e:
#     print(f"Validation queries failed: {e}")

# print("\nDatabase insertion completed successfully!")

In [26]:
# DISABLED CELL: every line below is commented out, so nothing here executes.
# If re-enabled, the cell closes `conn` (printing rather than raising on error)
# and then prints a summary of the OCR, field-extraction, visualization and
# database steps.
#
# NOTE(review), to check before re-enabling:
#   - Only `extrahierte_daten_count` is guarded with a locals() check; the summary
#     also reads `normalized_ocr_lines`, `normalized_fields`, `visualization_path`,
#     `antrag_id` and `dokument_id`, which raise NameError if the cells that
#     define them were not run.
# # Close database connection
# try:
#     conn.close()
#     print("Database connection closed")
# except Exception as e:
#     print(f"Error closing database connection: {e}")

# print("\n" + "="*60)
# print("SUMMARY")
# print("="*60)
# print(f"OCR Processing: {len(normalized_ocr_lines)} lines extracted")
# print(f"Field Extraction: {len(normalized_fields)} fields identified")
# print(f"Visualization: {visualization_path}")
# print(f"Database Insertion: Kreditantrag ID {antrag_id}, Dokument ID {dokument_id}")
# print(f"Extracted Fields: {extrahierte_daten_count if 'extrahierte_daten_count' in locals() else 'N/A'} fields stored")