<a href="https://colab.research.google.com/github/ocr-workspace/Ocr-model-testing-on-Scanned-legal-pdf/blob/main/legal_longform_Scanned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install**

In [None]:
!pip install numpy==1.24.4
!pip install pdf2image pdfminer.six evaluate pytesseract
!pip install paddlepaddle-gpu==2.6.2 paddleocr==2.7.0.3
!pip install python-doctr transformers fpdf

In [None]:
!apt-get update
!apt-get install -y poppler-utils

In [None]:
# Uninstall langchain to bypass the PaddleX import bug
# This will NOT touch your numpy or paddlepaddle installations!
!pip uninstall langchain langchain-community langchain-core -y

In [None]:
!pip list | grep -E "numpy|paddle|evaluate"

* Imports

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import cv2
import time

from PIL import Image
from fpdf import FPDF
from pdf2image import convert_from_path
from datasets import load_dataset
import evaluate

* Load FUNSD Dataset

In [None]:
# Load the "nielsr/funsd" dataset via `datasets.load_dataset`; its "train"
# split supplies the page images, words and bounding boxes used to build the
# benchmark PDFs and their ground truth.
ds_legal = load_dataset("nielsr/funsd")

# Show the available splits and columns.
print(ds_legal)

* Ground Truth Builder

In [None]:
def build_gt_with_lines(sample, line_threshold=10):
    """Rebuild line-structured ground-truth text from a word-level sample.

    Words are grouped into lines using the second coordinate of their
    bounding box (``box[1]``): scanning the words from top to bottom, a new
    line starts whenever that coordinate differs from the previous word's by
    more than ``line_threshold``. Inside each line the words are then ordered
    left to right by ``box[0]``, so a word that sits a little higher than its
    left-hand neighbours is no longer emitted before them.

    Args:
        sample: Mapping with two parallel sequences, ``"words"`` (strings)
            and ``"bboxes"`` (indexable boxes with x at index 0 and y at
            index 1).
        line_threshold: Largest vertical difference between consecutive
            words that still counts as the same line, in the same units as
            the boxes. Defaults to 10.

    Returns:
        The reconstructed lines joined by newline characters, or an empty
        string when the sample has no words.
    """
    ordered = sorted(
        zip(sample["words"], sample["bboxes"]),
        key=lambda pair: (pair[1][1], pair[1][0]),
    )

    rows = []
    last_y = None

    for word, box in ordered:
        y = box[1]

        # Start a new row on the first word or on a large vertical jump.
        if last_y is None or abs(y - last_y) > line_threshold:
            rows.append([])

        rows[-1].append((box[0], word))
        # Compare against the previous word (not the row start) so slightly
        # slanted lines are still followed.
        last_y = y

    lines = []
    for row in rows:
        # Restore reading order inside the row; the sort is stable, so words
        # sharing the same x keep their top-to-bottom order.
        row.sort(key=lambda item: item[0])
        lines.append(" ".join(word for _, word in row))

    return "\n".join(lines)

* Create 10 Multi-Page PDFs

In [None]:
def create_multi_page_pdfs(ds, num_docs=10, pages_per_doc=10):
    """Assemble multi-page PDFs from dataset images and collect their ground truth.

    Document ``d`` is built from the consecutive ``ds["train"]`` samples
    ``d * pages_per_doc`` .. ``(d + 1) * pages_per_doc - 1``. Each sample's
    image becomes one page of ``doc_{d}.pdf`` (written to the current working
    directory) and its text, rebuilt with ``build_gt_with_lines``, becomes
    one page of ground truth.

    Args:
        ds: Dataset mapping whose ``"train"`` split supports ``len()`` and
            integer indexing, and whose samples carry an ``"image"`` with a
            ``save(path)`` method plus the fields ``build_gt_with_lines``
            reads.
        num_docs: Number of PDFs to create.
        pages_per_doc: Number of pages (samples) per PDF.

    Returns:
        Dict mapping each PDF filename to its ground-truth text, with pages
        separated by a blank line.

    Raises:
        IndexError: If the train split holds fewer than
            ``num_docs * pages_per_doc`` samples. Raised before any file is
            written.
    """
    train_split = ds["train"]
    required = num_docs * pages_per_doc
    if required > len(train_split):
        raise IndexError(
            f"Need {required} samples for {num_docs} documents of "
            f"{pages_per_doc} pages, but the train split has only "
            f"{len(train_split)}"
        )

    gt_store = {}

    for d in range(num_docs):

        pdf = FPDF()
        combined_gt = []
        temp_paths = []

        try:
            for i in range(pages_per_doc):

                sample = train_split[d * pages_per_doc + i]

                # The page image is handed to FPDF by path, so it has to be
                # written to disk first.
                img_path = f"temp_{d}_{i}.png"
                temp_paths.append(img_path)
                sample["image"].save(img_path)

                pdf.add_page()
                # NOTE(review): w=210 presumably matches the default page
                # width in mm - confirm against the FPDF page format.
                pdf.image(img_path, x=0, y=0, w=210)

                combined_gt.append(build_gt_with_lines(sample))

            filename = f"doc_{d}.pdf"
            pdf.output(filename)
        finally:
            # The temporary PNGs are only needed until the PDF is written;
            # remove them so repeated runs do not litter the working directory.
            for path in temp_paths:
                if os.path.exists(path):
                    os.remove(path)

        gt_store[filename] = "\n\n".join(combined_gt)

    return gt_store

In [None]:
# Build the benchmark PDFs with the default sizes (10 documents x 10 pages)
# and keep the filename -> ground-truth text mapping for the evaluation cells.
gt_documents = create_multi_page_pdfs(ds_legal)

Convert PDF to images

In [None]:
def pdf_to_images(pdf_path, dpi=300):
    """Render the pages of a PDF file to images.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution passed to ``convert_from_path``.
            Defaults to 300, the value the benchmark was run with.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for the file
        (iterated page by page by the benchmark loop).
    """
    return convert_from_path(pdf_path, dpi=dpi)

Evaluation metrics

In [None]:
!pip install jiwer

In [None]:
# Character-error-rate and word-error-rate metrics, loaded once here and
# reused by evaluate_model_on_documents for every document.
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

In [None]:
def structural_metrics(gt_text, pred_text):
    """Compare the layout of an OCR prediction with its ground truth.

    Only non-blank lines are counted. Splitting on newlines always yields at
    least one element, so counting raw pieces would report one line for an
    empty text and would treat page separators and trailing newlines as
    lines.

    Args:
        gt_text: Ground-truth text with newline-separated lines.
        pred_text: OCR output with newline-separated lines.

    Returns:
        Dict with:
            ``Word_Count_Deviation``: absolute difference in the number of
                whitespace-separated words.
            ``GT_Lines`` / ``Pred_Lines``: number of non-blank lines.
            ``Line_Preservation_Ratio``: ``min(pred, gt) / gt`` line counts,
                capped at 1; 0 when the ground truth has no lines.
            ``Collapsed``: 1 when the prediction has three or fewer lines,
                else 0.
    """
    # Word count deviation
    word_dev = abs(len(gt_text.split()) - len(pred_text.split()))

    # Line preservation (blank lines carry no text and are ignored)
    gt_line_count = sum(1 for line in gt_text.split("\n") if line.strip())
    pred_line_count = sum(1 for line in pred_text.split("\n") if line.strip())

    if gt_line_count == 0:
        line_ratio = 0
    else:
        line_ratio = min(pred_line_count, gt_line_count) / gt_line_count

    # Collapse detection (OCR output squeezed into <= 3 lines)
    collapse_flag = 1 if pred_line_count <= 3 else 0

    return {
        "Word_Count_Deviation": word_dev,
        "GT_Lines": gt_line_count,
        "Pred_Lines": pred_line_count,
        "Line_Preservation_Ratio": line_ratio,
        "Collapsed": collapse_flag
    }

Setup OCR Models

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is used further down to place the docTR model.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


**Tesseract**

In [None]:
import pytesseract

def tesseract_ocr(image):
    """Run Tesseract on one page image and return the recognised text.

    Thin adapter so Tesseract has the same ``image -> str`` call shape as
    the other OCR functions passed to the benchmark.
    """
    recognised_text = pytesseract.image_to_string(image)
    return recognised_text

**PaddleOCR**

In [None]:
!pip install paddlepaddle-gpu

In [None]:
# Install the packages but strictly lock numpy at 1.26.4 simultaneously
!pip install numpy==1.26.4 paddleocr==2.8.1 evaluate doctr

In [None]:
import numpy as np

def paddle_ocr_structured(image, row_threshold=15):
    """Run PaddleOCR on one page image and rebuild its text line by line.

    Each detection is reduced to its text plus the centre of its box. The
    detections are de-duplicated, grouped into rows by vertical position,
    and every row is emitted left to right.

    Args:
        image: Page image; anything ``np.array`` can convert.
        row_threshold: Largest difference in vertical centre between
            consecutive detections that still counts as the same row, in
            image pixels. Defaults to 15.

    Returns:
        The rows joined by newline characters, or an empty string when
        nothing was detected.
    """
    image_np = np.array(image)
    # NOTE(review): `paddle_model` is not defined in any visible cell of this
    # notebook; it must be created (a PaddleOCR instance exposing `.ocr`)
    # before this function is called, otherwise this line raises NameError.
    result = paddle_model.ocr(image_np)

    words = []

    for line in result or []:
        # A page without detections can come back as None instead of an
        # empty list; skip it rather than crash while iterating.
        if not line:
            continue

        for word_info in line:
            box = word_info[0]
            text = word_info[1][0]

            words.append({
                "text": text,
                "y": np.mean([pt[1] for pt in box]),  # vertical centre
                "x": np.mean([pt[0] for pt in box]),  # horizontal centre
            })

    # Sort top to bottom (then left to right) for the row clustering below.
    words.sort(key=lambda w: (w["y"], w["x"]))

    # Drop repeated detections: same text at the same (rounded) position.
    unique_words = []
    seen = set()

    for w in words:
        key = (round(w["y"], 1), round(w["x"], 1), w["text"])
        if key not in seen:
            seen.add(key)
            unique_words.append(w)

    # Cluster into rows: a vertical jump larger than the threshold relative
    # to the previous detection opens a new row.
    rows = []
    last_y = None

    for w in unique_words:
        if last_y is None or abs(w["y"] - last_y) > row_threshold:
            rows.append([])
        rows[-1].append(w)
        last_y = w["y"]

    lines = []
    for row in rows:
        # Within a row the global (y, x) order can put a slightly higher
        # detection before ones to its left; restore reading order by x.
        row.sort(key=lambda w: w["x"])
        lines.append(" ".join(w["text"] for w in row))

    return "\n".join(lines)

**DocTR**

In [None]:
from doctr.models import ocr_predictor

# Build the pretrained docTR predictor once at cell level and move it to
# `device`; doctr_ocr_structured below reuses this global for every page.
doctr_model = ocr_predictor(pretrained=True).to(device)

def doctr_ocr_structured(image):
    """Run the docTR predictor on one page image and return its text.

    The prediction is walked page -> block -> line. Each line becomes its
    words joined by single spaces and terminated by a newline, so a
    non-empty result always ends with a newline.
    """
    document = doctr_model([np.array(image)])

    rendered_lines = [
        " ".join(token.value for token in text_line.words) + "\n"
        for page in document.pages
        for block in page.blocks
        for text_line in block.lines
    ]

    return "".join(rendered_lines)

**Benchmark Function**

In [None]:
def evaluate_model_on_documents(gt_documents, ocr_function, model_name):
    """Benchmark one OCR function on every ground-truth PDF.

    For each document the PDF is rendered to page images, ``ocr_function``
    is applied to every page, and the concatenated prediction is scored
    against the ground truth with CER, WER and the structural metrics.
    Summary statistics are printed.

    Args:
        gt_documents: Mapping of PDF path -> ground-truth text.
        ocr_function: Callable taking one page image and returning its text.
        model_name: Label used in the printed summary header.

    Returns:
        ``pandas.DataFrame`` with one row per document and the columns
        ``Document``, ``CER``, ``WER``, ``Word_Count_Deviation``,
        ``Line_Preservation_Ratio``, ``Collapsed`` and ``Time`` (seconds
        spent in ``ocr_function`` only; PDF rendering is not timed). The
        frame is empty, with the same columns, when ``gt_documents`` is
        empty.
    """
    columns = [
        "Document",
        "CER",
        "WER",
        "Word_Count_Deviation",
        "Line_Preservation_Ratio",
        "Collapsed",
        "Time",
    ]

    results = []

    for pdf_name, gt_text in gt_documents.items():

        pages = pdf_to_images(pdf_name)

        # Time only the OCR calls; perf_counter is monotonic, so the
        # measurement cannot be distorted by wall-clock adjustments.
        start = time.perf_counter()
        full_pred_text = "".join(ocr_function(page) + "\n" for page in pages)
        elapsed = time.perf_counter() - start

        # Basic metrics
        cer = cer_metric.compute(
            predictions=[full_pred_text],
            references=[gt_text]
        )

        wer = wer_metric.compute(
            predictions=[full_pred_text],
            references=[gt_text]
        )

        # Structural metrics
        struct = structural_metrics(gt_text, full_pred_text)

        results.append({
            "Document": pdf_name,
            "CER": cer,
            "WER": wer,
            "Word_Count_Deviation": struct["Word_Count_Deviation"],
            "Line_Preservation_Ratio": struct["Line_Preservation_Ratio"],
            "Collapsed": struct["Collapsed"],
            "Time": elapsed
        })

    # Passing the columns explicitly keeps the schema stable even when there
    # are no rows, so the column lookups below cannot raise KeyError.
    df = pd.DataFrame(results, columns=columns)

    print(f"\n===== {model_name} Results =====")

    if df.empty:
        print("No documents to evaluate.")
        return df

    print("Mean CER:", df["CER"].mean())
    print("Mean WER:", df["WER"].mean())
    print("Avg Word Count Deviation:", df["Word_Count_Deviation"].mean())
    print("Line Preservation Ratio:", df["Line_Preservation_Ratio"].mean())
    print("Collapse %:", df["Collapsed"].mean() * 100)
    print("Avg Time:", df["Time"].mean())

    return df

In [None]:
# Benchmark Tesseract on every generated PDF; the result holds one row of
# metrics per document.
df_tess = evaluate_model_on_documents(gt_documents, tesseract_ocr, "Tesseract")


===== Tesseract Results =====
Mean CER: 0.5963400905668071
Mean WER: 0.8647969541168979
Avg Word Count Deviation: 79.4
Line Preservation Ratio: 1.0
Collapse %: 0.0
Avg Time: 25.488605093955993


In [None]:
# Benchmark PaddleOCR (with the row-reconstruction wrapper) on the same PDFs.
df_paddle_struct = evaluate_model_on_documents(
    gt_documents,
    paddle_ocr_structured,
    "PaddleOCR Structured"
)

[2026/02/26 07:22:52] ppocr DEBUG: dt_boxes num : 30, elapsed : 0.04185342788696289
[2026/02/26 07:22:52] ppocr DEBUG: cls num  : 30, elapsed : 0.040070533752441406
[2026/02/26 07:22:52] ppocr DEBUG: rec_res num  : 30, elapsed : 0.18963170051574707
[2026/02/26 07:22:52] ppocr DEBUG: dt_boxes num : 115, elapsed : 0.06508564949035645
[2026/02/26 07:22:52] ppocr DEBUG: cls num  : 115, elapsed : 0.1487736701965332
[2026/02/26 07:22:52] ppocr DEBUG: rec_res num  : 115, elapsed : 0.3371303081512451
[2026/02/26 07:22:53] ppocr DEBUG: dt_boxes num : 44, elapsed : 0.04319167137145996
[2026/02/26 07:22:53] ppocr DEBUG: cls num  : 44, elapsed : 0.059000492095947266
[2026/02/26 07:22:53] ppocr DEBUG: rec_res num  : 44, elapsed : 0.14039182662963867
[2026/02/26 07:22:53] ppocr DEBUG: dt_boxes num : 168, elapsed : 0.07885622978210449
[2026/02/26 07:22:53] ppocr DEBUG: cls num  : 168, elapsed : 0.21531939506530762
[2026/02/26 07:22:54] ppocr DEBUG: rec_res num  : 168, elapsed : 0.4004671573638916
[20

In [None]:
# Benchmark docTR (text rebuilt line by line from its output) on the same PDFs.
df_doctr_struct = evaluate_model_on_documents(
    gt_documents,
    doctr_ocr_structured,
    "DocTR Structured"
)


===== DocTR Structured Results =====
Mean CER: 0.48094838210442176
Mean WER: 0.78336259351026
Avg Word Count Deviation: 90.1
Line Preservation Ratio: 1.0
Collapse %: 0.0
Avg Time: 4.04759452342987
