<a href="https://colab.research.google.com/github/ocr-workspace/Ocr-model-testing-on-Scanned-legal-pdf/blob/main/Scanned_pdf_legal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate pdf2image pytesseract  doctr transformers

**Dataset Loading**

In [None]:
from datasets import load_dataset

# Load the "nielsr/funsd" dataset through the Hugging Face `datasets` library.
# The benchmark runner later reads its "train" split.
ds_legal = load_dataset("nielsr/funsd")

# Print the dataset summary so the structure can be checked before benchmarking.
print(ds_legal)

**Ground Truth Builder Function**

In [None]:
def build_gt_funsd(sample):
    """Build the ground-truth string for one sample.

    The entries of ``sample["words"]`` are joined with single spaces and the
    result is stripped of leading/trailing whitespace.
    """
    return " ".join(sample["words"]).strip()

In [None]:
import torch
# Pick "cuda" when a CUDA device is visible to PyTorch, otherwise fall back to "cpu".
# NOTE: a later cell (docTR model loading) rebinds `device` to a torch.device object.
device = "cuda" if torch.cuda.is_available() else "cpu"

**Evaluation Metrics Setup**

In [None]:
!pip install jiwer

In [None]:
import evaluate
import numpy as np
import time
import pandas as pd

# Load the "cer" and "wer" metrics once; the benchmark runner calls
# `.compute(predictions=..., references=...)` on them for every sample.
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

**Runner Function**

In [None]:
import time
import cv2
from PIL import Image
import pandas as pd
import numpy as np
def benchmark_legal(ds, ocr_function, model_name, samples=50, split="train"):
    """Benchmark an OCR function against the dataset's ground-truth text.

    Parameters
    ----------
    ds : mapping of split name -> indexable dataset
        Each sample must provide an ``"image"`` entry and whatever
        ``build_gt_funsd`` reads to build the reference text.
    ocr_function : callable
        Receives the sample image and returns the recognised text as a string.
    model_name : str
        Label used in the printed summary header.
    samples : int, default 50
        Maximum number of samples to evaluate, taken from the start of the
        split. Clamped to the split size so it can never index past the end.
    split : str, default "train"
        Name of the split in ``ds`` to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated sample with the columns ``CER``, ``WER``,
        ``Word_Count_Deviation`` and ``Time`` (seconds spent inside
        ``ocr_function``). Empty (but with those columns) when nothing could
        be evaluated.
    """
    data = ds[split]
    # Clamp so a request larger than the split does not raise IndexError.
    n_eval = min(samples, len(data))

    columns = ["CER", "WER", "Word_Count_Deviation", "Time"]
    results = []

    for i in range(n_eval):
        sample = data[i]
        image = sample["image"]
        gt_text = build_gt_funsd(sample)

        if not gt_text:
            # Error rates are normalised by the reference length, so a sample
            # without reference text cannot be scored.
            print(f"Skipping sample {i}: empty ground truth")
            continue

        # perf_counter is monotonic, which makes it the right clock for
        # measuring short elapsed intervals.
        start = time.perf_counter()
        pred_text = ocr_function(image)
        elapsed = time.perf_counter() - start

        cer = cer_metric.compute(
            predictions=[pred_text],
            references=[gt_text]
        )

        wer = wer_metric.compute(
            predictions=[pred_text],
            references=[gt_text]
        )

        # Absolute difference in whitespace-separated token counts.
        word_dev = abs(
            len(pred_text.split()) - len(gt_text.split())
        )

        results.append({
            "CER": cer,
            "WER": wer,
            "Word_Count_Deviation": word_dev,
            "Time": elapsed
        })

    df = pd.DataFrame(results, columns=columns)

    print(f"\n===== {model_name} (Legal OCR) =====")
    if df.empty:
        # Nothing to average; avoid computing means over an empty frame.
        print("No samples were evaluated.")
        return df

    print("Mean CER:", df["CER"].mean())
    print("Mean WER:", df["WER"].mean())
    print("Avg Word Count Deviation:", df["Word_Count_Deviation"].mean())
    print("Avg Time:", df["Time"].mean())

    return df

**Tesseract Function**      

In [None]:
# Install the system `tesseract-ocr` package (via apt) and the `pytesseract`
# Python package that the Tesseract OCR function imports.
!apt-get install -y tesseract-ocr
!pip install pytesseract

In [None]:
import pytesseract

def tesseract_ocr(image):
    """Recognise the text in ``image`` with Tesseract.

    A PIL image is converted to a numpy array first; any other input is
    forwarded unchanged. The recognised text is returned with surrounding
    whitespace stripped.
    """
    pixels = np.array(image) if isinstance(image, Image.Image) else image
    return pytesseract.image_to_string(pixels).strip()

**PaddleOCR**

In [None]:
!pip install paddlepaddle-gpu

In [None]:
# Force-uninstall the broken numpy
!pip uninstall numpy -y

# Install the packages but strictly lock numpy at 1.26.4 simultaneously
!pip install numpy==1.26.4 paddleocr==2.8.1 evaluate doctr

In [None]:
# Uninstall langchain to bypass the PaddleX import bug
# This will NOT touch your numpy or paddlepaddle installations!
!pip uninstall langchain langchain-community langchain-core -y

**Model Loading**

In [None]:
!pip list | grep -E "numpy|paddle|evaluate"

evaluate                                 0.4.6
numpy                                    1.26.4
paddleocr                                2.8.1
paddlepaddle-gpu                         2.6.2


In [None]:
from paddleocr import PaddleOCR

# Shared PaddleOCR pipeline used by both the PaddleOCR and the hybrid OCR functions.
paddle_model = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    # Follow the runtime instead of assuming a GPU, matching the CUDA check
    # the notebook already uses for the PyTorch models.
    use_gpu=torch.cuda.is_available(),
    # Silence the per-image ppocr DEBUG lines that otherwise flood the
    # benchmark cell output.
    show_log=False,
)

Function

In [None]:
def paddle_ocr(image):
    """Run the shared PaddleOCR pipeline on one image and return its text.

    Parameters
    ----------
    image : PIL.Image.Image or array
        A PIL image is converted to a numpy array; any other input is passed
        to ``paddle_model.ocr`` unchanged.

    Returns
    -------
    str
        The recognised strings joined by single spaces, in the order
        PaddleOCR returns them, with surrounding whitespace stripped.
        Empty when nothing was detected.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)

    result = paddle_model.ocr(image)

    words = []
    # PaddleOCR 2.x returns one entry per input image and uses None for an
    # image without detections, so guard before iterating.
    for page in result or []:
        if not page:
            continue
        for word_info in page:
            # The recognised text is the first element of the entry's second item.
            words.append(word_info[1][0])

    return " ".join(words).strip()

**DocTR**

Model Loading

In [None]:
 !pip install python-doctr

In [None]:
from doctr.models import ocr_predictor
import torch

# 1. Dynamically check for a GPU (CUDA). If not found, use the CPU.
# NOTE: this rebinds the notebook-wide `device` (a plain string in the earlier
# cell) to a torch.device; the TrOCR loading cell further down reuses it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Loading model onto: {device}")

# 2. Load the docTR model and send it to the chosen device
doctr_model = ocr_predictor(pretrained=True).to(device)

print("docTR model loaded successfully!")

Loading model onto: cuda
docTR model loaded successfully!


* Function

In [None]:
def doctr_ocr(image):
    """Recognise the text in ``image`` with the shared docTR predictor.

    A PIL image is converted to a numpy array, and a 2-D (single-channel)
    array is expanded to RGB before inference. Every word value found in the
    result's pages/blocks/lines is joined with single spaces and the final
    string is stripped.
    """
    pixels = np.array(image) if isinstance(image, Image.Image) else image

    # The predictor is fed a colour image, so promote grayscale input.
    if len(pixels.shape) == 2:
        pixels = cv2.cvtColor(pixels, cv2.COLOR_GRAY2RGB)

    document = doctr_model([pixels])

    tokens = [
        word.value
        for page in document.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
    ]

    return " ".join(tokens).strip()

**Hybrid (Paddle Detection + TrOCR Recognition)**

In [None]:
!pip install transformers

* Load TrOCR (Printed)

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Processor and model weights for the "microsoft/trocr-base-printed" checkpoint.
# The model is moved to `device`, which must already be defined by an earlier cell.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model_trocr = VisionEncoderDecoderModel.from_pretrained(
    "microsoft/trocr-base-printed"
).to(device)

* Hybrid Function

In [None]:
def hybrid_ocr(image):
    """Detect text regions with PaddleOCR and recognise each one with TrOCR.

    Parameters
    ----------
    image : PIL.Image.Image or numpy.ndarray
        A PIL image is converted to a numpy array before processing.

    Returns
    -------
    str
        The TrOCR output of every non-empty crop, joined by single spaces.
        Crops are ordered by the (y, x) of each box's first point. Empty
        when PaddleOCR detects nothing.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)

    result = paddle_model.ocr(image)

    # PaddleOCR 2.x uses None for an image without detections, so skip such
    # pages instead of iterating over None.
    boxes = [
        word_info[0]
        for page in (result or [])
        if page
        for word_info in page
    ]

    # Order by the first corner of each box: top-to-bottom, then left-to-right.
    # NOTE(review): keying on the exact y means boxes on the same text line
    # that differ by a pixel can be ordered out of reading order.
    boxes.sort(key=lambda box: (box[0][1], box[0][0]))

    recognized_words = []

    for box in boxes:
        pts = np.array(box).astype(int)

        # Axis-aligned bounding rectangle of the box, clamped at 0 so a
        # negative coordinate cannot wrap around when slicing.
        x_min = max(0, pts[:, 0].min())
        y_min = max(0, pts[:, 1].min())
        x_max = max(0, pts[:, 0].max())
        y_max = max(0, pts[:, 1].max())

        crop = image[y_min:y_max, x_min:x_max]

        if crop.size == 0:
            continue

        # The crop is converted to 3-channel RGB before recognition:
        # grayscale is expanded and an alpha channel is dropped.
        if len(crop.shape) == 2:
            crop = cv2.cvtColor(crop, cv2.COLOR_GRAY2RGB)
        elif crop.shape[2] == 4:
            crop = cv2.cvtColor(crop, cv2.COLOR_RGBA2RGB)

        # Upscale the crop 2x in both directions before recognition.
        crop = cv2.resize(crop, None, fx=2, fy=2)

        # Convert to PIL (TrOCR prefers PIL)
        crop_pil = Image.fromarray(crop)

        pixel_values = processor(
            images=crop_pil,
            return_tensors="pt"
        ).pixel_values.to(device)

        generated_ids = model_trocr.generate(pixel_values)

        text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0]

        recognized_words.append(text)

    return " ".join(recognized_words)

In [None]:
# Benchmark Tesseract with the runner's default sample count.
df_tess = benchmark_legal(ds_legal, tesseract_ocr, "Tesseract")


===== Tesseract (Legal OCR) =====
Mean CER: 0.5772690521505756
Mean WER: 0.7157845192522629
Avg Word Count Deviation: 18.56
Avg Time: 0.884716830253601


In [None]:
# Benchmark PaddleOCR with the runner's default sample count.
df_paddle = benchmark_legal(ds_legal, paddle_ocr, "PaddleOCR")

[2026/02/26 04:32:39] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.9679949283599854
[2026/02/26 04:32:39] ppocr DEBUG: cls num  : 29, elapsed : 0.16066360473632812
[2026/02/26 04:32:39] ppocr DEBUG: rec_res num  : 29, elapsed : 0.2750120162963867
[2026/02/26 04:32:39] ppocr DEBUG: dt_boxes num : 113, elapsed : 0.07697701454162598
[2026/02/26 04:32:40] ppocr DEBUG: cls num  : 113, elapsed : 0.3188467025756836
[2026/02/26 04:32:40] ppocr DEBUG: rec_res num  : 113, elapsed : 0.32578086853027344
[2026/02/26 04:32:40] ppocr DEBUG: dt_boxes num : 42, elapsed : 0.046300411224365234
[2026/02/26 04:32:40] ppocr DEBUG: cls num  : 42, elapsed : 0.05474567413330078
[2026/02/26 04:32:40] ppocr DEBUG: rec_res num  : 42, elapsed : 0.1426393985748291
[2026/02/26 04:32:41] ppocr DEBUG: dt_boxes num : 150, elapsed : 0.08202791213989258
[2026/02/26 04:32:41] ppocr DEBUG: cls num  : 150, elapsed : 0.19369125366210938
[2026/02/26 04:32:41] ppocr DEBUG: rec_res num  : 150, elapsed : 0.39368319511413574
[2026

In [None]:
# Benchmark docTR with the runner's default sample count.
df_doctr = benchmark_legal(ds_legal, doctr_ocr, "DocTR")


===== DocTR (Legal OCR) =====
Mean CER: 0.4750372750721484
Mean WER: 0.6198433987584714
Avg Word Count Deviation: 10.38
Avg Time: 0.29754053115844725


In [None]:
# Benchmark the PaddleOCR-detection + TrOCR-recognition hybrid with the
# runner's default sample count.
df_hybrid = benchmark_legal(ds_legal, hybrid_ocr, "Hybrid")

[2026/02/26 04:46:48] ppocr DEBUG: dt_boxes num : 29, elapsed : 0.05209517478942871
[2026/02/26 04:46:49] ppocr DEBUG: cls num  : 29, elapsed : 0.23719406127929688
[2026/02/26 04:46:49] ppocr DEBUG: rec_res num  : 29, elapsed : 0.16016411781311035
[2026/02/26 04:46:55] ppocr DEBUG: dt_boxes num : 113, elapsed : 0.0753326416015625
[2026/02/26 04:46:56] ppocr DEBUG: cls num  : 113, elapsed : 0.14495444297790527
[2026/02/26 04:46:56] ppocr DEBUG: rec_res num  : 113, elapsed : 0.3207263946533203
[2026/02/26 04:47:08] ppocr DEBUG: dt_boxes num : 42, elapsed : 0.045534610748291016
[2026/02/26 04:47:08] ppocr DEBUG: cls num  : 42, elapsed : 0.0556187629699707
[2026/02/26 04:47:09] ppocr DEBUG: rec_res num  : 42, elapsed : 0.14213156700134277
[2026/02/26 04:47:14] ppocr DEBUG: dt_boxes num : 150, elapsed : 0.07756328582763672
[2026/02/26 04:47:14] ppocr DEBUG: cls num  : 150, elapsed : 0.1912078857421875
[2026/02/26 04:47:15] ppocr DEBUG: rec_res num  : 150, elapsed : 0.3940284252166748
[2026/