In [14]:
import torch

if torch.cuda.is_available():
    print("CUDA is available! Using GPU.")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.get_device_name(0)}") # Prints the name of the first GPU
else:
    print("CUDA is NOT available. Using CPU.")

CUDA is NOT available. Using CPU.


In [9]:
import cv2
import numpy as np
from PIL import Image

def extract_lines(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilation to join characters into lines
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 5))
    dilated = cv2.dilate(binary, kernel, iterations=1)

    # Find contours (lines)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    lines = []
    for cnt in sorted(contours, key=lambda c: cv2.boundingRect(c)[1]):  # Sort top to bottom
        x, y, w, h = cv2.boundingRect(cnt)
        line_img = image[y:y+h, x:x+w]
        pil_img = Image.fromarray(cv2.cvtColor(line_img, cv2.COLOR_BGR2RGB))
        lines.append(pil_img)
    return lines


In [10]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def ocr_line(image):
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
image_path = "test.jpg"
lines = extract_lines(image_path)

all_text = []
for i, line_img in enumerate(lines):
    text = ocr_line(line_img)
    all_text.append(text)
    print(f"Line {i+1}: {text}")

# Full page text
full_text = "\n".join(all_text)
print("\nFull OCR Output:\n", full_text)
# 7 minutes


Line 1: 0
Line 2: 0
Line 3: 0 0
Line 4: to
Line 5: wimmerman
Line 6: of Partnership .
Line 7: appropriate
Line 8: 0 0
Line 9: 0
Line 10: 0
Line 11: po
Line 12: 0.
Line 13: ww
Line 14: a
Line 15: 0.
Line 16: y
Line 17: se
Line 18: 0 0
Line 19: 0
Line 20: 9
Line 21: l
Line 22: 1961 American film director of the American film
Line 23: 0
Line 24: 0
Line 25: 0 0
Line 26: 0 0
Line 27: 0 0
Line 28: 0 0
Line 29: 0
Line 30: 0 0
Line 31: 0 0
Line 32: 0 0
Line 33: 0 0
Line 34: 0 0
Line 35: 0 0
Line 36: 0 0
Line 37: 0 0
Line 38: be
Line 39: a
Line 40: al
Line 41: lovesi
Line 42: pert
Line 43: estals
Line 44: 19.
Line 45: exti
Line 46: tisher .
Line 47: nags )
Line 48: itho
Line 49: 1942 43
Line 50: to
Line 51: 0
Line 52: 0 .
Line 53: l
Line 54: wt .
Line 55: moxy
Line 56: 0 0
Line 57: nos .
Line 58: 0
Line 59: p.
Line 60: l
Line 61: s
Line 62: l
Line 63: co
Line 64: 0 0
Line 65: 0 0
Line 66: ciny
Line 67: 0
Line 68: 99
Line 69: l
Line 70: 0 0
Line 71: 0 0
Line 72: 0 0
Line 73: 0 0
Line 74: 0 0
Lin

In [16]:
import cv2
import numpy as np
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Load TrOCR
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Load and preprocess image using OpenCV
image_path = "test2.png"
img = cv2.imread(image_path)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Threshold to binary image
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

# Dilation to group text lines
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 5))
dilated = cv2.dilate(binary, kernel, iterations=3)

# Find contours (each line/block)
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Sort contours from top to bottom
sorted_contours = sorted(contours, key=lambda ctr: cv2.boundingRect(ctr)[1])

all_text = []

for i, contour in enumerate(sorted_contours):
    x, y, w, h = cv2.boundingRect(contour)
    roi = img[y:y+h, x:x+w]

    # Convert to PIL image
    pil_image = Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))

    # Use TrOCR to recognize
    pixel_values = processor(images=pil_image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    line_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    all_text.append(line_text)

# Combine all lines
full_text = "\n".join(all_text)
print("🔍 Full Page Text Output:\n")
print(full_text)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔍 Full Page Text Output:

0
0
0
0
b.
d
s
a
0
il
il
il
0
t
t
e
s
a
a
a
e
c
11
e
il
e
2
0
Delegates from Mr. Kenneth Kaunda's United National Independence Party ( 280,000
d
d
il
y
d
h.
fir .
id
ip
il
n
tu
k
ik
ivi
0
0
0
0
8
2.
t
t
t
t
t
y
ir.
a
e
c.
11
e
ill
e
ip
e
il
a
11
0
a
e
11
s
a
il
ll
a
e
il
11
e
ir.
ill
o
s
e
a
is
e
e
y
0
l
ii
9
sp
il
il
ii
ii
b.
k
d
il
fir .
y
il
in
h.
ivi
0
0
0
0
c
n
a
t
0
4.
t
e
e
ill
w.
s
s
e
ir.
g.
11
0
11
0
a
11
a
c.
s
a
ll
ill
ll
y
ir.
ir.
a
ir.
11
a
s
ir.
e
ill
e
ill
a
exjoy
0
ii
ii
d
il
ii
ii
ii
d
cl
d
il
if
il
r.
s
t.
t
t.
t.
t
0
e
s
o
ip
ip
o
y
11
e
o
v.
s
y
0
10.
11
a
e
c.
11
0
ill
ill
o
c.
a
s
s
ll
c.
s
0
ay
o
11
o
11
o
11
0
c.
0
s
ir.
ll
o
0
il
ii
ii
ii
i
ej
ii
ii
fir .
g.
il
h.
b.
d
h.
h.
d
i
a
c
r.
n
t
t
t
t
t
t
t
t
joy
ir.
a
e
ir.
c.
e
a
il
o
o
e
ll
s
11
y
ir.
a
inn
e
e
11
a
g.
11
e
g.
s
11
a
c.
a
e
0
11
ir.
e
ir.
o
o
c.
e
my
ii
ii
ii
ii
ii
ii
h.
f.
il
cl
il
ivi
ivi-
il
t
e
g.
il
a
c
0
joy
c.
o
ip
a
il
o
g.
il
s
s
11
s
0
e
c.
a
11
a
10.
0
y
0
190