In [None]:
import json, re, base64
from pathlib import Path
from typing import Optional
import requests

# ── Configuration ──────────────────────────────────────────────────
OPENAI_BASE_URL = "http://localhost:11434/v1"  # swap to your provider
OPENAI_API_KEY  = "ollama"                     # swap as needed
OPENAI_MODEL    = "qwen3:32b"                # swap as needed

def call_llm(prompt: str) -> str:
    """Call an OpenAI-compatible chat endpoint. `image` is base64-encoded."""
    content = []
    content.append({"type": "text", "text": prompt})

    resp = requests.post(
        f"{OPENAI_BASE_URL}/chat/completions",
        headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
        json={"model": OPENAI_MODEL, "messages": [{"role": "user", "content": content}], "stop": ["\n\n", "IN:"],
              "temperature": 0.1, "max_tokens": 4096},
        timeout=300,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

print("✓ Config loaded")

✓ Config loaded


In [16]:
# ── Tier 1: Hard Baseline Metrics ─────────────────────────────────

def _levenshtein(a: str, b: str) -> int:
    """Standard DP Levenshtein distance."""
    n, m = len(a), len(b)
    dp = list(range(m + 1))
    for i in range(1, n + 1):
        prev, dp[0] = dp[0], i
        for j in range(1, m + 1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev + (0 if a[i - 1] == b[j - 1] else 1))
            prev = cur
    return dp[m]


def _normalize(text: str, lower: bool = False) -> str:
    t = re.sub(r'\s+', ' ', text).strip()
    return t.lower() if lower else t


def tier1_metrics(ground_truth: str, ocr_output: str, lower: bool = False) -> dict:
    """CER, WER, exact-match between ground truth and OCR output."""
    gt = _normalize(ground_truth, lower)
    ocr = _normalize(ocr_output, lower)

    cer = _levenshtein(gt, ocr) / max(len(gt), 1)
    gt_words, ocr_words = gt.split(), ocr.split()
    wer = _levenshtein(" ".join(gt_words), " ".join(ocr_words)) / max(len(" ".join(gt_words)), 1)
    # Word-level WER (token-based edit distance)
    wer_tok = _levenshtein_words(gt_words, ocr_words) / max(len(gt_words), 1)

    return {
        "input": ocr_output,
        "cer": round(cer, 4),
        "wer": round(wer, 4),
        "wer_token": round(wer_tok, 4),
        "exact_match": gt == ocr,
        "gt_chars": len(gt),
        "ocr_chars": len(ocr),
    }


def _levenshtein_words(a: list, b: list) -> int:
    """Levenshtein on word-token lists."""
    n, m = len(a), len(b)
    dp = list(range(m + 1))
    for i in range(1, n + 1):
        prev, dp[0] = dp[0], i
        for j in range(1, m + 1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev + (0 if a[i - 1] == b[j - 1] else 1))
            prev = cur
    return dp[m]

print("✓ Tier 1 functions defined")

✓ Tier 1 functions defined


In [None]:
# ── Tier 2+3: LLM Evaluation & Correction ────────────────────────

LLM_EVAL_PROMPT = """
# OCR Post-Processing

You are deciphering a corrupted text. The original meaning is hidden under OCR damage. Your job is to recover it.

## Rules
1. **The meaning already exists** — you are revealing it, not creating it. Never paraphrase or inject new ideas.
2. **Noise characters/words are common** — OCR inserts random letters, splits words, or merges them. Single letters or fragments that make no grammatical sense (like a stray "y" or "a") are likely noise or remnants of a real word.
3. **Corruption can be severe** — a word may be completely unrecognizable. Infer it from:
   - Grammar: What part of speech must fit here?
   - Context: What are the surrounding words talking about?
   - Letter traces: Any surviving letters that hint at the original? (e.g. "Hergy" → "therapy", "alio" → "aliz")
4. **Adjacent fragments often form one word** — look for split words that reconstruct when merged (e.g. "inter alio ing" → "internalizing").
5. **Preserve everything that isn't damaged** — keep original punctuation intent, sentence structure, and word order.

## Example
Input: The irony is, that most a y these people will be in Hergy, inter alio ing eve y thing
Output: The irony is, that most of these people will be in therapy, internalizing everything

Notice: "a y" → "of" (noise fragments replaced by contextually correct word), "Hergy" → "therapy" (letter traces + context), "inter alio ing" → "internalizing" (fragments merged), "eve y thing" → "everything" (noise letter removed).

## Input
to live and explore. The problem lies inrerally in most cases. The irony is, that most a y these people will be in Hergy, inter alio ing eve y thing but still reaching the con el usi on that the world is in pe fec and they s one how are not a rfa wu t.  God' s creations are per fed our minds, magni c tory, and oo con cio us nees are the imperfections.  The world and society are nor created by God, so Huy are due their crisis m bur the real sinners a eus, our minds jy ras ere and adam bitH he o pple. In fact, f uch the stra vau o, I should n' t hau e defended that, ir won' t my point. My point is that un l by s you are her miro ll y il for a child

## Output format
Respond with ONLY the corrected text. No preamble, no explanations, no labels, no markdown formatting. Just the clean corrected text and nothing else.
"""


def tier23_llm_eval(ocr_output: str) -> dict:
    """Single LLM call: evaluate + correct OCR output. Returns parsed dict."""
    prompt = LLM_EVAL_PROMPT.format(ocr_text=ocr_output)

    raw = call_llm(prompt)

    return raw

print("✓ Tier 2+3 functions defined")

✓ Tier 2+3 functions defined


In [18]:
# ── Full Evaluation Pipeline ──────────────────────────────────────

def evaluate_ocr(
    ocr_output: str,
    ground_truth: Optional[str] = None,
    lower: bool = False,
) -> dict:
    """
    Full OCR evaluation: hard metrics (if GT available) + LLM scoring/correction.
    Returns a single structured dict ready for JSON serialization.
    """
    result = {}

    # ── Tier 1 ────────────────────────────────────────────────────
    if ground_truth is not None:
        result["tier1_raw_vs_gt"] = tier1_metrics(ground_truth, ocr_output, lower)

    # ── Tier 2+3 ──────────────────────────────────────────────────
    llm_response = tier23_llm_eval(ocr_output)

    # ── Corrected vs GT (did post-processing help?) ───────────────
    if ground_truth is not None:
        result["tier1_corrected_vs_gt"] = tier1_metrics(ground_truth, llm_response, lower)
    return result


# ── Run it ─────────────────────────────────────────────────────────
# Fill these in with your data:
ocr_output = Path("../data/output/IMG_4737.txt").read_text(encoding="utf-8")

# Set to None if no ground truth is available
gt_path = Path("../data/output/IMG_4737_gt.md")
ground_truth = gt_path.read_text(encoding="utf-8") if gt_path.exists() else None

result = evaluate_ocr(ocr_output, ground_truth=ground_truth)
print("✓ Evaluation complete")

✓ Evaluation complete


In [19]:
result

{'tier1_raw_vs_gt': {'input': "to live and explore. The problem lies inrerally in most cases. The irony is, that most a y these people will be in Hergy, inter alio ing eve y thing but still reaching the con el usi on that the world is in pe fec and they s one how are not a rfa wu t.  God' s creations are per fed our minds, magni c tory, and oo con cio us nees are the imperfections.  The world and society are nor created by God, so Huy are due their crisis m bur the real sinners a eus, our minds jy ras ere and adam bitH he o pple. In fact, f uch the stra vau o, I should n' t hau e defended that, ir won' t my point. My point is that un l by s you are her miro ll y il for a child",
  'cer': 0.5636,
  'wer': 0.5636,
  'wer_token': 0.4807,
  'exact_match': False,
  'gt_chars': 1494,
  'ocr_chars': 652},
 'tier1_corrected_vs_gt': {'input': '',
  'cer': 1.0,
  'wer': 1.0,
  'wer_token': 1.0,
  'exact_match': False,
  'gt_chars': 1494,
  'ocr_chars': 0}}

In [None]:
# ── Summary ───────────────────────────────────────────────────────

def print_eval(r: dict):
    if "tier1_raw_vs_gt" in r:
        t = r["tier1_raw_vs_gt"]
        print("── Tier 1: Raw OCR vs Ground Truth ──")
        print(f"  CER  {t['cer']:.2%}   WER  {t['wer']:.2%}   WER(tok) {t['wer_token']:.2%}   Exact: {t['exact_match']}")

    s = r["scores"]
    print("\n── LLM Scores (original OCR) ──")
    for k, v in s.items():
        print(f"  {k}: {v}/100")

    cs = r["correction_stats"]
    print(f"\n── Corrections: {cs['total']} total  (avg conf {cs['avg_confidence']:.2f}) ──")
    for label, counts in [("Category", cs["by_category"]), ("Severity", cs["by_severity"])]:
        print(f"  {label}: {counts}")

    if "tier1_corrected_vs_gt" in r:
        t2 = r["tier1_corrected_vs_gt"]
        print("\n── Tier 1: Corrected vs Ground Truth ──")
        print(f"  CER  {t2['cer']:.2%}   WER  {t2['wer']:.2%}   WER(tok) {t2['wer_token']:.2%}   Exact: {t2['exact_match']}")
        if "tier1_raw_vs_gt" in r:
            delta_cer = r["tier1_raw_vs_gt"]["cer"] - t2["cer"]
            delta_wer = r["tier1_raw_vs_gt"]["wer"] - t2["wer"]
            print(f"  Δ CER {delta_cer:+.2%}   Δ WER {delta_wer:+.2%}  {'(improved)' if delta_cer > 0 else '(worse)'}")

    print(f"\n── Summary ──\n  {r['summary']}")
    print(f"\n── Corrected Text (first 500 chars) ──\n{r['corrected_text'][:500]}")

print_eval(result)

# Full JSON for programmatic use
# print(json.dumps(result, indent=2, ensure_ascii=False))